Add GpuBuffer class (#423)

* Renamed and moved mem alloc functions into the `mscclpp::detail::` namespace (now `mscclpp::detail::gpuCalloc*<T>()`) * Deprecated constructor-calling mem alloc functions (`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`) * Added a new `mscclpp::GpuBuffer<T>()` class that should be used in general for allocating communication buffers * Added a new `mscclpp.utils.GpuBuffer` Python class that inherits `cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc` * Renamed `mscclpp::memcpyCuda*<T>()` functions into `mscclpp::gpuMemcpy*<T>()` for name consistency * A few fixes in NVLS memory allocation * Tackled minor compiler warnings
2026-06-08 15:30:41 +00:00 · 2025-01-07 18:40:01 -08:00
parent 6d26b92665
commit 34945fb107
38 changed files with 527 additions and 555 deletions
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -35,7 +35,7 @@ jobs:
        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
        make -j
      workingDirectory: '$(System.DefaultWorkingDirectory)'
-  
+
  - task: DownloadSecureFile@1
    name: SshKeyFile
    displayName: Download key file
--- a/apps/nccl/src/broadcast.hpp
+++ b/apps/nccl/src/broadcast.hpp
@@ -15,7 +15,7 @@
 template <bool IsOutOfPlace>
 __global__ void __launch_bounds__(1024, 1)
    broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
-               size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root,
+               [[maybe_unused]] size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root,
               size_t nRanksPerNode, size_t nelemsPerGPU) {
  const size_t nThread = blockDim.x * gridDim.x;
  const size_t nPeer = nRanksPerNode - 1;
--- a/apps/nccl/src/nccl.cu
+++ b/apps/nccl/src/nccl.cu
@@ -176,9 +176,9 @@ static std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> setupSmChannel
  std::transform(smChannels.begin(), smChannels.end(), std::back_inserter(smChannelDeviceHandles),
                 [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
  std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> ptr =
-      mscclpp::allocSharedCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(smChannelDeviceHandles.size());
-  mscclpp::memcpyCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(ptr.get(), smChannelDeviceHandles.data(),
-                                                                 smChannelDeviceHandles.size(), cudaMemcpyHostToDevice);
+      mscclpp::detail::gpuCallocShared<mscclpp::DeviceHandle<mscclpp::SmChannel>>(smChannelDeviceHandles.size());
+  mscclpp::gpuMemcpy<mscclpp::DeviceHandle<mscclpp::SmChannel>>(ptr.get(), smChannelDeviceHandles.data(),
+                                                                smChannelDeviceHandles.size(), cudaMemcpyHostToDevice);
  return ptr;
 }

@@ -360,7 +360,7 @@ static void ncclCommInitRankFallbackSingleNode(ncclComm* commPtr, std::shared_pt
  commPtr->smSemaphores = std::move(smSemaphores);
  commPtr->buffFlag = 0;
  commPtr->numScratchBuff = 2;
-  commPtr->scratchBuff = mscclpp::allocExtSharedCuda<char>(SCRATCH_SIZE);
+  commPtr->scratchBuff = mscclpp::GpuBuffer(SCRATCH_SIZE).memory();
  commPtr->remoteScratchRegMemories =
      setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), SCRATCH_SIZE, mscclpp::Transport::CudaIpc);
 }
@@ -624,7 +624,6 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
  }

  int rank = comm->comm->bootstrap()->getRank();
-  int nRank = comm->comm->bootstrap()->getNranks();

  std::vector<executionPlanInstance>& plans = comm->executionPlans["broadcast"];
  std::shared_ptr<mscclpp::ExecutionPlan> plan;
@@ -817,18 +816,13 @@ NCCL_API ncclResult_t ncclCommDeregister(const ncclComm_t, void*) {
 }

 ncclResult_t ncclMemAlloc(void** ptr, size_t size) {
-  // Allocate memory using mscclpp::allocSharedPhysicalCuda
  if (ptr == nullptr || size == 0) {
    WARN("ptr is nullptr or size is 0");
    return ncclInvalidArgument;
  }
  std::shared_ptr<char> sharedPtr;
  try {
-    if (mscclpp::isNvlsSupported()) {
-      sharedPtr = mscclpp::allocSharedPhysicalCuda<char>(size);
-    } else {
-      sharedPtr = mscclpp::allocExtSharedCuda<char>(size);
-    }
+    sharedPtr = mscclpp::GpuBuffer(size).memory();
    if (sharedPtr == nullptr) {
      INFO(MSCCLPP_ALLOC, "Failed to allocate memory");
      return ncclSystemError;
--- a/docs/getting-started/tutorials/python-api.md
+++ b/docs/getting-started/tutorials/python-api.md
@@ -14,6 +14,7 @@ from mscclpp import (
    ProxyService,
    Transport,
 )
+from mscclpp.utils import GpuBuffer
 import mscclpp.comm as mscclpp_comm

 def create_connection(group: mscclpp_comm.CommGroup, transport: str):
@@ -32,7 +33,7 @@ if __name__ == "__main__":
    mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
    connections = create_connection(mscclpp_group, "NVLink")
    nelems = 1024
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
    proxy_service = ProxyService()
    simple_channels = group.make_proxy_channels(proxy_service, memory, connections)
    proxy_service.start_proxy()
--- a/include/mscclpp/gpu_utils.hpp
+++ b/include/mscclpp/gpu_utils.hpp
@@ -4,7 +4,6 @@
 #ifndef MSCCLPP_GPU_UTILS_HPP_
 #define MSCCLPP_GPU_UTILS_HPP_

-#include <cstring>
 #include <memory>

 #include "errors.hpp"
@@ -35,19 +34,6 @@

 namespace mscclpp {

-/// set memory access permission to read-write
-/// @param base Base memory pointer.
-/// @param size Size of the memory.
-inline void setReadWriteMemoryAccess(void* base, size_t size) {
-  CUmemAccessDesc accessDesc = {};
-  int deviceId;
-  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
-  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  accessDesc.location.id = deviceId;
-  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-  MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, size, &accessDesc, 1));
-}
-
 /// A RAII guard that will cudaThreadExchangeStreamCaptureMode to cudaStreamCaptureModeRelaxed on construction and
 /// restore the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture.
 struct AvoidCudaGraphCaptureGuard {
@@ -64,101 +50,46 @@ struct CudaStreamWithFlags {
  cudaStream_t stream_;
 };

-template <class T>
-struct CudaDeleter;
-
 namespace detail {

-/// A wrapper of cudaMalloc that sets the allocated memory to zero.
-/// @tparam T Type of each element in the allocated memory.
-/// @param nelem Number of elements to allocate.
-/// @return A pointer to the allocated memory.
-template <class T>
-T* cudaCalloc(size_t nelem) {
-  AvoidCudaGraphCaptureGuard cgcGuard;
-  T* ptr;
-  CudaStreamWithFlags stream(cudaStreamNonBlocking);
-  MSCCLPP_CUDATHROW(cudaMalloc(&ptr, nelem * sizeof(T)));
-  MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, nelem * sizeof(T), stream));
-  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
-  return ptr;
-}
+void setReadWriteMemoryAccess(void* base, size_t size);

-#if (CUDA_NVLS_SUPPORTED)
-template <class T>
-T* cudaPhysicalCalloc(size_t nelems, size_t gran) {
-  AvoidCudaGraphCaptureGuard cgcGuard;
-  int deviceId = -1;
-  CUdevice currentDevice;
-  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
-  MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));
-
-  CUmemAllocationProp prop = {};
-  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  prop.requestedHandleTypes =
-      (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
-  prop.location.id = currentDevice;
-
-  // allocate physical memory
-  CUmemGenericAllocationHandle memHandle;
-  size_t nbytes = (nelems * sizeof(T) + gran - 1) / gran * gran;
-  MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));
-
-  T* devicePtr = nullptr;
-  MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, gran, 0U, 0));
-  MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0));
-  setReadWriteMemoryAccess(devicePtr, nbytes);
-  CudaStreamWithFlags stream(cudaStreamNonBlocking);
-  MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream));
-  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
-
-  return devicePtr;
-}
-#endif
-
-template <class T>
-T* cudaExtCalloc(size_t nelem) {
-  AvoidCudaGraphCaptureGuard cgcGuard;
-  T* ptr;
-  CudaStreamWithFlags stream(cudaStreamNonBlocking);
+void* gpuCalloc(size_t bytes);
+void* gpuCallocHost(size_t bytes);
 #if defined(__HIP_PLATFORM_AMD__)
-  MSCCLPP_CUDATHROW(hipExtMallocWithFlags((void**)&ptr, nelem * sizeof(T), hipDeviceMallocUncached));
-#else
-  MSCCLPP_CUDATHROW(cudaMalloc(&ptr, nelem * sizeof(T)));
-#endif
-  MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, nelem * sizeof(T), stream));
-  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
-  return ptr;
-}
+void* gpuCallocUncached(size_t bytes);
+#endif  // defined(__HIP_PLATFORM_AMD__)
+#if (CUDA_NVLS_SUPPORTED)
+void* gpuCallocPhysical(size_t bytes, size_t gran = 0, size_t align = 0);
+#endif  // CUDA_NVLS_SUPPORTED

-/// A wrapper of cudaHostAlloc that sets the allocated memory to zero.
-/// @tparam T Type of each element in the allocated memory.
-/// @param nelem Number of elements to allocate.
-/// @return A pointer to the allocated memory.
-template <class T>
-T* cudaHostCalloc(size_t nelem) {
-  AvoidCudaGraphCaptureGuard cgcGuard;
-  T* ptr;
-  MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, nelem * sizeof(T), cudaHostAllocMapped | cudaHostAllocWriteCombined));
-  memset(ptr, 0, nelem * sizeof(T));
-  return ptr;
-}
+void gpuFree(void* ptr);
+void gpuFreeHost(void* ptr);
+#if (CUDA_NVLS_SUPPORTED)
+void gpuFreePhysical(void* ptr);
+#endif  // CUDA_NVLS_SUPPORTED
+
+void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream,
+                    cudaMemcpyKind kind = cudaMemcpyDefault);
+void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault);

 /// A template function that allocates memory while ensuring that the memory will be freed when the returned object is
 /// destroyed.
 /// @tparam T Type of each element in the allocated memory.
-/// @tparam alloc A function that allocates memory.
 /// @tparam Deleter A deleter that will be used to free the allocated memory.
 /// @tparam Memory The type of the returned object.
-/// @param nelem Number of elements to allocate.
+/// @tparam Alloc A function type that allocates memory.
+/// @tparam Args Input types of the @p alloc function variables.
+/// @param alloc A function that allocates memory.
+/// @param nelems Number of elements to allocate.
+/// @param args Extra input variables for the @p alloc function.
 /// @return An object of type @p Memory that will free the allocated memory when destroyed.
 ///
-template <class T, T*(alloc)(size_t), class Deleter, class Memory>
-Memory safeAlloc(size_t nelem) {
+template <class T, class Deleter, class Memory, typename Alloc, typename... Args>
+Memory safeAlloc(Alloc alloc, size_t nelems, Args&&... args) {
  T* ptr = nullptr;
  try {
-    ptr = alloc(nelem);
+    ptr = reinterpret_cast<T*>(alloc(nelems * sizeof(T), std::forward<Args>(args)...));
  } catch (...) {
    if (ptr) {
      Deleter()(ptr);
@@ -168,258 +99,164 @@ Memory safeAlloc(size_t nelem) {
  return Memory(ptr, Deleter());
 }

-template <class T, T*(alloc)(size_t, size_t), class Deleter, class Memory>
-Memory safeAlloc(size_t nelem, size_t gran) {
-  if ((nelem * sizeof(T)) % gran) {
-    throw Error("The request allocation size is not divisible by the required granularity:" +
-                    std::to_string(nelem * sizeof(T)) + " vs " + std::to_string(gran),
-                ErrorCode::InvalidUsage);
-  }
-  T* ptr = nullptr;
-  try {
-    ptr = alloc(nelem, gran);
-  } catch (...) {
-    if (ptr) {
-      Deleter()(ptr);
-    }
-    throw;
-  }
-  return Memory(ptr, Deleter());
+/// A deleter that calls gpuFree for use with std::unique_ptr or std::shared_ptr.
+/// @tparam T Type of each element in the allocated memory.
+template <class T = void>
+struct GpuDeleter {
+  void operator()(void* ptr) { gpuFree(ptr); }
+};
+
+/// A deleter that calls gpuFreeHost for use with std::unique_ptr or std::shared_ptr.
+/// @tparam T Type of each element in the allocated memory.
+template <class T = void>
+struct GpuHostDeleter {
+  void operator()(void* ptr) { gpuFreeHost(ptr); }
+};
+
+#if (CUDA_NVLS_SUPPORTED)
+template <class T = void>
+struct GpuPhysicalDeleter {
+  void operator()(void* ptr) { gpuFreePhysical(ptr); }
+};
+#endif  // CUDA_NVLS_SUPPORTED
+
+template <class T>
+using UniqueGpuPtr = std::unique_ptr<T, detail::GpuDeleter<T>>;
+
+template <class T>
+using UniqueGpuHostPtr = std::unique_ptr<T, detail::GpuHostDeleter<T>>;
+
+template <class T>
+auto gpuCallocShared(size_t nelems = 1) {
+  return detail::safeAlloc<T, detail::GpuDeleter<T>, std::shared_ptr<T>>(detail::gpuCalloc, nelems);
 }

+template <class T>
+auto gpuCallocUnique(size_t nelems = 1) {
+  return detail::safeAlloc<T, detail::GpuDeleter<T>, UniqueGpuPtr<T>>(detail::gpuCalloc, nelems);
+}
+
+template <class T>
+auto gpuCallocHostShared(size_t nelems = 1) {
+  return detail::safeAlloc<T, detail::GpuHostDeleter<T>, std::shared_ptr<T>>(detail::gpuCallocHost, nelems);
+}
+
+template <class T>
+auto gpuCallocHostUnique(size_t nelems = 1) {
+  return detail::safeAlloc<T, detail::GpuHostDeleter<T>, UniqueGpuHostPtr<T>>(detail::gpuCallocHost, nelems);
+}
+
+#if defined(__HIP_PLATFORM_AMD__)
+
+template <class T>
+auto gpuCallocUncachedShared(size_t nelems = 1) {
+  return detail::safeAlloc<T, detail::GpuDeleter<T>, std::shared_ptr<T>>(detail::gpuCallocUncached, nelems);
+}
+
+template <class T>
+auto gpuCallocUncachedUnique(size_t nelems = 1) {
+  return detail::safeAlloc<T, detail::GpuDeleter<T>, UniqueGpuPtr<T>>(detail::gpuCallocUncached, nelems);
+}
+
+#endif  // defined(__HIP_PLATFORM_AMD__)
+
+#if (CUDA_NVLS_SUPPORTED)
+
+template <class T>
+using UniqueGpuPhysicalPtr = std::unique_ptr<T, detail::GpuPhysicalDeleter<T>>;
+
+template <class T>
+auto gpuCallocPhysicalShared(size_t nelems = 1, size_t gran = 0, size_t align = 0) {
+  return detail::safeAlloc<T, detail::GpuPhysicalDeleter<T>, std::shared_ptr<T>>(detail::gpuCallocPhysical, nelems,
+                                                                                 gran, align);
+}
+
+template <class T>
+auto gpuCallocPhysicalUnique(size_t nelems = 1, size_t gran = 0, size_t align = 0) {
+  return detail::safeAlloc<T, detail::GpuPhysicalDeleter<T>, UniqueGpuPhysicalPtr<T>>(detail::gpuCallocPhysical, nelems,
+                                                                                      gran, align);
+}
+
+size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag);
+
+#endif  // CUDA_NVLS_SUPPORTED
+
 }  // namespace detail

-/// A deleter that calls cudaFree for use with std::unique_ptr or std::shared_ptr.
-/// @tparam T Type of each element in the allocated memory.
-template <class T>
-struct CudaDeleter {
-  using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
-  void operator()(TPtrOrArray ptr) {
-    AvoidCudaGraphCaptureGuard cgcGuard;
-    MSCCLPP_CUDATHROW(cudaFree(ptr));
-  }
-};
-
-template <class T>
-struct CudaPhysicalDeleter {
-  static_assert(!std::is_array_v<T>, "T must not be an array");
-  void operator()(T* ptr) {
-    AvoidCudaGraphCaptureGuard cgcGuard;
-    CUmemGenericAllocationHandle handle;
-    size_t size = 0;
-    MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, ptr));
-    MSCCLPP_CUTHROW(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-    MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, size));
-    MSCCLPP_CUTHROW(cuMemRelease(handle));
-    MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, size));
-  }
-};
-
-/// A deleter that calls cudaFreeHost for use with std::unique_ptr or std::shared_ptr.
-/// @tparam T Type of each element in the allocated memory.
-template <class T>
-struct CudaHostDeleter {
-  using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
-  void operator()(TPtrOrArray ptr) {
-    AvoidCudaGraphCaptureGuard cgcGuard;
-    MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
-  }
-};
-
-/// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
-/// @tparam T Type of each element in the allocated memory.
-/// @param count Number of elements to allocate.
-/// @return A std::shared_ptr to the allocated memory.
-template <class T>
-std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
-  return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
+template <class T = char>
+void gpuMemcpyAsync(T* dst, const T* src, size_t nelems, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) {
+  detail::gpuMemcpyAsync(dst, src, nelems * sizeof(T), stream, kind);
 }

+template <class T = char>
+void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMemcpyDefault) {
+  detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
+}
+
+bool isNvlsSupported();
+
+/// Allocates a GPU memory space specialized for communication. The memory is zeroed out. Get the device pointer by
+/// `GpuBuffer::data()`.
+///
+/// Use this function for communication buffers, i.e., only when other devices (CPU, GPU, NIC, etc.) may access this
+/// memory space at the same time with the local device (GPU). Running heavy computation over this memory space
+/// may perform bad and is not recommended in general.
+///
+/// The allocated memory space is managed by the `memory_` object, not by the class instance. Which means,
+/// the class destructor will NOT free the allocated memory if `memory_` is shared with and alive in other contexts.
+///
+/// @tparam T Type of each element in the allocated memory. Default is `char`.
+///
+template <class T = char>
+class GpuBuffer {
+ public:
+  /// Constructs a GpuBuffer with the specified number of elements.
+  /// @param nelems Number of elements to allocate. If it is zero, `data()` will return a null pointer.
+  GpuBuffer(size_t nelems) : nelems_(nelems) {
+    if (nelems == 0) {
+      bytes_ = 0;
+      return;
+    }
 #if (CUDA_NVLS_SUPPORTED)
-static inline size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
-  size_t gran = 0;
-  int numDevices = 0;
-  MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices));
+    if (isNvlsSupported()) {
+      size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
+      bytes_ = (nelems * sizeof(T) + gran - 1) / gran * gran / sizeof(T) * sizeof(T);
+      memory_ = detail::gpuCallocPhysicalShared<T>(nelems, gran);
+      return;
+    }
+#endif  // CUDA_NVLS_SUPPORTED

-  CUmulticastObjectProp prop = {};
-  prop.size = size;
-  // This is a dummy value, it might affect the granularity in the future
-  prop.numDevices = numDevices;
-  prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
-  prop.flags = 0;
-  MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag));
-  return gran;
-}
-#endif
-
-/// Allocates physical memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
-/// @tparam T Type of each element in the allocated memory.
-/// @param count Number of elements to allocate.
-/// @param gran the granularity of the allocation.
-/// @return A std::shared_ptr to the allocated memory.
-template <class T>
-std::shared_ptr<T> allocSharedPhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
-#if (CUDA_NVLS_SUPPORTED)
-  if (!isNvlsSupported()) {
-    throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
-  }
-  if (count == 0) {
-    return nullptr;
+    bytes_ = nelems * sizeof(T);
+#if defined(__HIP_PLATFORM_AMD__)
+    memory_ = detail::gpuCallocUncachedShared<T>(nelems);
+#else   // !defined(__HIP_PLATFORM_AMD__)
+    memory_ = detail::gpuCallocShared<T>(nelems);
+#endif  // !defined(__HIP_PLATFORM_AMD__)
  }

-  if (gran == 0) {
-    gran = getMulticastGranularity(count * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
-  }
-  size_t nelems = ((count * sizeof(T) + gran - 1) / gran * gran) / sizeof(T);
-  return detail::safeAlloc<T, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>, std::shared_ptr<T>>(nelems, gran);
-#else
-  throw Error("Only support GPU with Fabric support", ErrorCode::InvalidUsage);
-#endif
-}
+  /// Returns the number of elements in the allocated memory.
+  /// @return The number of elements.
+  size_t nelems() const { return nelems_; }

-/// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
-/// @tparam T Type of each element in the allocated memory.
-/// @param count Number of elements to allocate.
-/// @return A std::shared_ptr to the allocated memory.
-template <class T>
-std::shared_ptr<T> allocExtSharedCuda(size_t count = 1) {
-  return detail::safeAlloc<T, detail::cudaExtCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
-}
+  /// Returns the number of bytes that is actually allocated. This may be larger than `nelems() * sizeof(T)`.
+  /// @return The number of bytes.
+  size_t bytes() const { return bytes_; }

-/// Unique device pointer that will call cudaFree on destruction.
-/// @tparam T Type of each element in the allocated memory.
-template <class T>
-using UniqueCudaPtr = std::unique_ptr<T, CudaDeleter<T>>;
+  /// Returns the shared pointer to the allocated memory.
+  /// If `nelems()` is zero, this function will return an empty shared pointer.
+  /// @return A `std::shared_ptr` to the allocated memory.
+  std::shared_ptr<T> memory() { return memory_; }

-/// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
-/// @tparam T Type of each element in the allocated memory.
-/// @param count Number of elements to allocate.
-/// @return A std::unique_ptr to the allocated memory.
-template <class T>
-UniqueCudaPtr<T> allocUniqueCuda(size_t count = 1) {
-  return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
-}
+  /// Returns the device pointer to the allocated memory. Equivalent to `memory().get()`.
+  /// If `nelems()` is zero, this function will return a null pointer.
+  /// @return A device pointer to the allocated memory.
+  T* data() { return memory_.get(); }

-/// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
-/// @tparam T Type of each element in the allocated memory.
-/// @param count Number of elements to allocate.
-/// @return A std::unique_ptr to the allocated memory.
-template <class T>
-UniqueCudaPtr<T> allocExtUniqueCuda(size_t count = 1) {
-  return detail::safeAlloc<T, detail::cudaExtCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
-}
-
-/// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::shared_ptr to it.
-/// @tparam T Type of the object to construct.
-/// @tparam Args Types of the arguments to pass to the constructor.
-/// @param args Arguments to pass to the constructor.
-/// @return A std::shared_ptr to the allocated memory.
-template <class T, typename... Args>
-std::shared_ptr<T> makeSharedCudaHost(Args&&... args) {
-  auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(1);
-  new (ptr.get()) T(std::forward<Args>(args)...);
-  return ptr;
-}
-
-/// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
-/// std::shared_ptr to it.
-/// @tparam T Type of the object to construct.
-/// @param count Number of elements to allocate.
-/// @return A std::shared_ptr to the allocated memory.
-template <class T>
-std::shared_ptr<T[]> makeSharedCudaHost(size_t count) {
-  using TElem = std::remove_extent_t<T>;
-  auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<TElem>, std::shared_ptr<T[]>>(count);
-  for (size_t i = 0; i < count; ++i) {
-    new (&ptr[i]) TElem();
-  }
-  return ptr;
-}
-
-/// Unique CUDA host pointer that will call cudaFreeHost on destruction.
-/// @tparam T Type of each element in the allocated memory.
-template <class T>
-using UniqueCudaHostPtr = std::unique_ptr<T, CudaHostDeleter<T>>;
-
-/// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::unique_ptr to it.
-/// @tparam T Type of the object to construct.
-/// @tparam Args Types of the arguments to pass to the constructor.
-/// @param args Arguments to pass to the constructor.
-/// @return A std::unique_ptr to the allocated memory.
-template <class T, typename... Args, std::enable_if_t<false == std::is_array_v<T>, bool> = true>
-UniqueCudaHostPtr<T> makeUniqueCudaHost(Args&&... args) {
-  auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(1);
-  new (ptr.get()) T(std::forward<Args>(args)...);
-  return ptr;
-}
-
-/// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
-/// std::unique_ptr to it.
-/// @tparam T Type of the object to construct.
-/// @param count Number of elements to allocate.
-/// @return A std::unique_ptr to the allocated memory.
-template <class T, std::enable_if_t<true == std::is_array_v<T>, bool> = true>
-UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
-  using TElem = std::remove_extent_t<T>;
-  auto ptr = detail::safeAlloc<TElem, detail::cudaHostCalloc<TElem>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
-  for (size_t i = 0; i < count; ++i) {
-    new (&ptr[i]) TElem();
-  }
-  return ptr;
-}
-
-/// Allocated physical memory on the device and returns a memory handle along with a virtual memory handle for it.
-/// The memory is zeroed out.
-/// @tparam T Type of each element in the allocated memory.
-/// @param count Number of elements to allocate.
-/// @param gran the granularity of the allocation.
-/// @return A std::unique_ptr to the allocated memory.
-template <class T>
-std::unique_ptr<T> allocUniquePhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
-#if (CUDA_NVLS_SUPPORTED)
-  if (!isNvlsSupported()) {
-    throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
-  }
-  if (count == 0) {
-    return nullptr;
-  }
-
-  if (gran == 0) {
-    gran = getMulticastGranularity(count * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
-  }
-  return detail::safeAlloc<T, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>,
-                           std::unique_ptr<CudaPhysicalDeleter<T>, CudaDeleter<CudaPhysicalDeleter<T>>>>(count, gran);
-#else
-  throw Error("Only support GPU with Fabric support", ErrorCode::InvalidUsage);
-#endif
-}
-
-/// Asynchronous cudaMemcpy without capture into a CUDA graph.
-/// @tparam T Type of each element in the allocated memory.
-/// @param dst Destination pointer.
-/// @param src Source pointer.
-/// @param count Number of elements to copy.
-/// @param stream CUDA stream to use.
-/// @param kind Type of cudaMemcpy to perform.
-template <class T>
-void memcpyCudaAsync(T* dst, const T* src, size_t count, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) {
-  AvoidCudaGraphCaptureGuard cgcGuard;
-  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
-}
-
-/// Synchronous cudaMemcpy without capture into a CUDA graph.
-/// @tparam T Type of each element in the allocated memory.
-/// @param dst Destination pointer.
-/// @param src Source pointer.
-/// @param count Number of elements to copy.
-/// @param kind Type of cudaMemcpy to perform.
-template <class T>
-void memcpyCuda(T* dst, const T* src, size_t count, cudaMemcpyKind kind = cudaMemcpyDefault) {
-  AvoidCudaGraphCaptureGuard cgcGuard;
-  CudaStreamWithFlags stream(cudaStreamNonBlocking);
-  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
-  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
-}
+ private:
+  size_t nelems_;
+  size_t bytes_;
+  std::shared_ptr<T> memory_;
+};

 }  // namespace mscclpp

--- a/include/mscclpp/npkit/npkit.hpp
+++ b/include/mscclpp/npkit/npkit.hpp
@@ -83,18 +83,18 @@ class NpKit {
  // 64K * 2 (send/recv) * (1024/64) = 2M, 2M * 64 * 16B = 2GB per CPU
  static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;

-  static std::vector<mscclpp::UniqueCudaPtr<NpKitEvent>> gpu_event_buffers_;
+  static std::vector<mscclpp::detail::UniqueGpuPtr<NpKitEvent>> gpu_event_buffers_;
  static std::vector<std::unique_ptr<NpKitEvent[]>> cpu_event_buffers_;

-  static mscclpp::UniqueCudaPtr<NpKitEventCollectContext> gpu_collect_contexts_;
+  static mscclpp::detail::UniqueGpuPtr<NpKitEventCollectContext> gpu_collect_contexts_;
  static std::unique_ptr<NpKitEventCollectContext[]> cpu_collect_contexts_;

  static uint64_t rank_;

 #if defined(__HIP_PLATFORM_AMD__)
-  static mscclpp::UniqueCudaHostPtr<uint64_t[]> cpu_timestamp_;
+  static mscclpp::detail::UniqueGpuHostPtr<uint64_t[]> cpu_timestamp_;
 #else
-  static mscclpp::UniqueCudaHostPtr<uint64_t> cpu_timestamp_;
+  static mscclpp::detail::UniqueGpuHostPtr<uint64_t> cpu_timestamp_;
 #endif
  static std::unique_ptr<std::thread> cpu_timestamp_update_thread_;
  static volatile bool cpu_timestamp_update_thread_should_stop_;
--- a/include/mscclpp/nvls.hpp
+++ b/include/mscclpp/nvls.hpp
@@ -16,9 +16,6 @@ class NvlsConnection {
  NvlsConnection() = delete;
  std::vector<char> serialize();

-  // the recommended buffer size for NVLS, returned by cuMulticastGetGranularity
-  static const int DefaultNvlsBufferSize;
-
  // Everyone needs to synchronize after creating a NVLS connection before adding devices
  void addDevice();
  void addDevice(int cudaDeviceId);
@@ -39,10 +36,10 @@ class NvlsConnection {
    friend class NvlsConnection;
  };

-  /// @brief bind the allocated memory via @ref mscclpp::allocSharedPhysicalCuda to the multicast handle. The behavior
-  /// is undefined if the devicePtr is not allocated by @ref mscclpp::allocSharedPhysicalCuda.
-  /// @param devicePtr
-  /// @param size
+  /// @brief bind the memory allocated via @ref mscclpp::GpuBuffer to the multicast handle. The behavior
+  /// is undefined if the devicePtr is not allocated by @ref mscclpp::GpuBuffer.
+  /// @param devicePtr The device pointer returned by `mscclpp::GpuBuffer::data()`.
+  /// @param size The bytes of the memory to bind to the multicast handle.
  /// @return DeviceMulticastPointer with devicePtr, mcPtr and bufferSize
  DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size);

@@ -65,7 +62,7 @@ class Communicator;
 /// @param config The configuration for the local endpoint.
 /// @return std::shared_ptr<NvlsConnection> A shared pointer to the NVLS connection.
 std::shared_ptr<NvlsConnection> connectNvlsCollective(std::shared_ptr<Communicator> comm, std::vector<int> allRanks,
-                                                      size_t bufferSize = NvlsConnection::DefaultNvlsBufferSize);
+                                                      size_t bufferSize);

 }  // namespace mscclpp

--- a/include/mscclpp/semaphore.hpp
+++ b/include/mscclpp/semaphore.hpp
@@ -64,7 +64,7 @@ class BaseSemaphore {
 };

 /// A semaphore for sending signals from the host to the device.
-class Host2DeviceSemaphore : public BaseSemaphore<CudaDeleter, std::default_delete> {
+class Host2DeviceSemaphore : public BaseSemaphore<detail::GpuDeleter, std::default_delete> {
 private:
  std::shared_ptr<Connection> connection_;

@@ -117,7 +117,7 @@ class Host2HostSemaphore : public BaseSemaphore<std::default_delete, std::defaul
 };

 /// A semaphore for sending signals from the local device to a peer device via SM.
-class SmDevice2DeviceSemaphore : public BaseSemaphore<CudaDeleter, CudaDeleter> {
+class SmDevice2DeviceSemaphore : public BaseSemaphore<detail::GpuDeleter, detail::GpuDeleter> {
 public:
  /// Constructor.
  /// @param communicator The communicator.
--- a/include/mscclpp/utils.hpp
+++ b/include/mscclpp/utils.hpp
@@ -37,8 +37,6 @@ struct ScopedTimer : public Timer {

 std::string getHostName(int maxlen, const char delim);

-bool isNvlsSupported();
-
 }  // namespace mscclpp

 #endif  // MSCCLPP_UTILS_HPP_
--- a/python/mscclpp/init.py
+++ b/python/mscclpp/init.py
@@ -24,9 +24,9 @@ from ._mscclpp import (
    Executor,
    ExecutionPlan,
    PacketType,
+    RawGpuBuffer,
    version,
    is_nvls_supported,
-    alloc_shared_physical_cuda,
    npkit,
 )

--- a/python/mscclpp/gpu_utils_py.cpp
+++ b/python/mscclpp/gpu_utils_py.cpp
@@ -1,30 +1,18 @@
 #include <nanobind/nanobind.h>
 #include <nanobind/stl/shared_ptr.h>

-// #include <memory>
 #include <mscclpp/gpu_data_types.hpp>
 #include <mscclpp/gpu_utils.hpp>

 namespace nb = nanobind;
 using namespace mscclpp;

-class PyCudaMemory {
- public:
-  PyCudaMemory(size_t size) : size_(size) { ptr_ = allocSharedPhysicalCuda<char>(size); }
-
-  uintptr_t getPtr() const { return (uintptr_t)(ptr_.get()); }
-  size_t size() const { return size_; }
-
- private:
-  std::shared_ptr<char> ptr_;
-  size_t size_;
-};
-
 void register_gpu_utils(nb::module_& m) {
-  nb::class_<PyCudaMemory>(m, "PyCudaMemory")
-      .def(nb::init<size_t>(), nb::arg("size"))
-      .def("get_ptr", &PyCudaMemory::getPtr, "Get the raw pointer")
-      .def("size", &PyCudaMemory::size, "Get the size of the allocated memory");
-  m.def(
-      "alloc_shared_physical_cuda", [](size_t size) { return std::make_shared<PyCudaMemory>(size); }, nb::arg("size"));
+  m.def("is_nvls_supported", &isNvlsSupported);
+
+  nb::class_<GpuBuffer<char>>(m, "RawGpuBuffer")
+      .def(nb::init<size_t>(), nb::arg("nelems"))
+      .def("nelems", &GpuBuffer<char>::nelems)
+      .def("bytes", &GpuBuffer<char>::bytes)
+      .def("data", [](GpuBuffer<char>& self) { return reinterpret_cast<uintptr_t>(self.data()); });
 }
--- a/python/mscclpp/nvls_py.cpp
+++ b/python/mscclpp/nvls_py.cpp
@@ -34,5 +34,5 @@ void register_nvls(nb::module_& m) {
      .def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity);

  m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("allRanks"),
-        nb::arg("bufferSize") = NvlsConnection::DefaultNvlsBufferSize);
+        nb::arg("bufferSize"));
 }
--- a/python/mscclpp/utils.py
+++ b/python/mscclpp/utils.py
@@ -6,10 +6,11 @@ import os
 import struct
 import subprocess
 import tempfile
-from typing import Any, Type
+from typing import Any, Type, Union, Tuple

 import cupy as cp
 import numpy as np
+from ._mscclpp import RawGpuBuffer

 try:
    import torch
@@ -36,7 +37,7 @@ class Kernel:
        nblocks: int,
        nthreads: int,
        shared: int,
-        stream: Type[cp.cuda.Stream] or Type[None],
+        stream: Union[cp.cuda.Stream, None],
    ):
        buffer = (ctypes.c_byte * len(params)).from_buffer_copy(params)
        buffer_size = ctypes.c_size_t(len(params))
@@ -137,6 +138,25 @@ class KernelBuilder:
            self._tempdir.cleanup()


+class GpuBuffer(cp.ndarray):
+    def __new__(
+        cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
+    ):
+        # Check if `shape` is valid
+        if isinstance(shape, int):
+            shape = (shape,)
+        try:
+            shape = tuple(shape)
+        except TypeError:
+            raise ValueError("Shape must be a tuple-like or an integer.")
+        if any(s <= 0 for s in shape):
+            raise ValueError("Shape must be positive.")
+        # Create the buffer
+        buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
+        memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
+        return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)
+
+
 def pack(*args):
    res = b""
    for arg in list(args):
--- a/python/mscclpp/utils_py.cpp
+++ b/python/mscclpp/utils_py.cpp
@@ -20,5 +20,4 @@ void register_utils(nb::module_& m) {
  nb::class_<ScopedTimer, Timer>(m, "ScopedTimer").def(nb::init<std::string>(), nb::arg("name"));

  m.def("get_host_name", &getHostName, nb::arg("maxlen"), nb::arg("delim"));
-  m.def("is_nvls_supported", &isNvlsSupported);
 }
--- a/python/mscclpp_benchmark/allreduce_bench.py
+++ b/python/mscclpp_benchmark/allreduce_bench.py
@@ -15,6 +15,7 @@ from mpi4py import MPI
 import cupy.cuda.nccl as nccl
 import mscclpp.comm as mscclpp_comm
 from mscclpp import ProxyService, is_nvls_supported
+from mscclpp.utils import GpuBuffer
 from prettytable import PrettyTable
 import netifaces as ni
 import ipaddress
@@ -162,8 +163,8 @@ def find_best_config(mscclpp_call, niter):
 def run_benchmark(
    mscclpp_group: mscclpp_comm.CommGroup, nccl_op: nccl.NcclCommunicator, table: PrettyTable, niter: int, nelem: int
 ):
-    memory = cp.zeros(nelem, dtype=data_type)
-    memory_out = cp.zeros(nelem, dtype=data_type)
+    memory = GpuBuffer(nelem, dtype=data_type)
+    memory_out = GpuBuffer(nelem, dtype=data_type)
    cp.cuda.runtime.deviceSynchronize()

    proxy_service = ProxyService()
--- a/python/mscclpp_benchmark/mscclpp_op.py
+++ b/python/mscclpp_benchmark/mscclpp_op.py
@@ -1,9 +1,9 @@
 import os
 import cupy as cp
 import ctypes
-from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore, alloc_shared_physical_cuda
+from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore
 import mscclpp.comm as mscclpp_comm
-from mscclpp.utils import KernelBuilder, pack
+from mscclpp.utils import KernelBuilder, GpuBuffer, pack


 IB_TRANSPORTS = [
@@ -115,7 +115,7 @@ class MscclppAllReduce2:
        self.connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc)
        type_str = type_to_str(memory.dtype)

-        self.scratch = cp.zeros(self.memory.size * 8, dtype=self.memory.dtype)
+        self.scratch = GpuBuffer(self.memory.size * 8, dtype=self.memory.dtype)
        # create a sm_channel for each remote neighbor
        self.sm_channels = self.group.make_sm_channels_with_scratch(self.memory, self.scratch, self.connections)
        file_dir = os.path.dirname(os.path.abspath(__file__))
@@ -179,7 +179,7 @@ class MscclppAllReduce3:
        type_str = type_to_str(memory.dtype)

        self.proxy_service = proxy_service
-        self.scratch = cp.zeros(self.memory.size, dtype=self.memory.dtype)
+        self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype)

        # create a sm_channel for each remote neighbor
        self.fst_round_proxy_chans = self.group.make_proxy_channels_with_scratch(
@@ -259,7 +259,7 @@ class MscclppAllReduce4:
        type_str = type_to_str(memory.dtype)

        self.proxy_service = proxy_service
-        self.scratch = cp.zeros(self.memory.size, dtype=self.memory.dtype)
+        self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype)
        same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)}
        # create a sm_channel for each remote neighbor
        self.sm_channels = self.group.make_sm_channels(self.memory, same_node_connections)
@@ -362,8 +362,8 @@ class MscclppAllReduce5:
        type_str = type_to_str(memory.dtype)

        self.proxy_service = proxy_service
-        self.scratch = cp.zeros(self.memory.size * 8, dtype=self.memory.dtype)
-        self.put_buff = cp.zeros(self.memory.size * 8 // nranks_per_node, dtype=self.memory.dtype)
+        self.scratch = GpuBuffer(self.memory.size * 8, dtype=self.memory.dtype)
+        self.put_buff = GpuBuffer(self.memory.size * 8 // nranks_per_node, dtype=self.memory.dtype)
        same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)}
        across_node_connections = {rank: conn for rank, conn in self.connections.items() if not in_same_node(rank)}
        # create a sm_channel for each remote neighbor
@@ -441,18 +441,10 @@ class MscclppAllReduce6:
        # create a connection for each remote neighbor
        self.nvlink_connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc)
        self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls)
-        min_gran = self.nvls_connection.get_multicast_min_granularity()
-        aligned_buffer_size = int(((buffer_size + min_gran - 1) // min_gran) * min_gran)
-        buffer_raw = alloc_shared_physical_cuda(aligned_buffer_size)
+        self.memory = GpuBuffer(nelem, memory_dtype)
        self.nvls_mem_handle = self.nvls_connection.bind_allocated_memory(
-            buffer_raw.get_ptr(), aligned_buffer_size
-        )  # just using recommended size for now
-        self.memory_ptr = self.nvls_mem_handle.get_device_ptr()
-
-        self.cp_memory_ptr = cp.cuda.MemoryPointer(
-            cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, buffer_raw), 0
+            self.memory.data.ptr, self.memory.data.mem.size
        )
-        self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr)

        # create a sm_channel for each remote neighbor
        self.semaphores = group.make_semaphore(self.nvlink_connections, SmDevice2DeviceSemaphore)
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -8,11 +8,9 @@ from mscclpp import (
    ExecutionPlan,
    PacketType,
    npkit,
-    alloc_shared_physical_cuda,
-    is_nvls_supported,
 )
 import mscclpp.comm as mscclpp_comm
-from mscclpp.utils import KernelBuilder, pack
+from mscclpp.utils import KernelBuilder, GpuBuffer, pack
 import os
 import struct

@@ -129,18 +127,6 @@ def dtype_to_mscclpp_dtype(dtype):
        raise ValueError(f"Unknown data type: {dtype}")


-def allocate_buffer(nelems, dtype):
-    if is_nvls_supported():
-        buffer_raw = alloc_shared_physical_cuda(nelems * cp.dtype(dtype).itemsize)
-        buffer_ptr = cp.cuda.MemoryPointer(
-            cp.cuda.UnownedMemory(buffer_raw.get_ptr(), buffer_raw.size(), buffer_raw), 0
-        )
-        buffer = cp.ndarray(nelems, dtype=dtype, memptr=buffer_ptr)
-        return buffer
-    else:
-        return cp.zeros(nelems, dtype=dtype)
-
-
 def build_bufs(
    collective: str,
    size: int,
@@ -160,14 +146,14 @@ def build_bufs(
        nelems_input = nelems
    nelems_output = nelems

-    result_buf = allocate_buffer(nelems_output, dtype=dtype)
+    result_buf = GpuBuffer(nelems_output, dtype=dtype)
    if in_place:
        if "allgather" in collective:
            input_buf = cp.split(result_buf, num_ranks)[rank]
        else:
            input_buf = result_buf
    else:
-        input_buf = allocate_buffer(nelems_input, dtype=dtype)
+        input_buf = GpuBuffer(nelems_input, dtype=dtype)
    test_buf = cp.zeros(nelems_output, dtype=dtype)

    return input_buf, result_buf, test_buf
--- a/python/test/test_mscclpp.py
+++ b/python/test/test_mscclpp.py
@@ -27,7 +27,7 @@ from mscclpp import (
    npkit,
 )
 import mscclpp.comm as mscclpp_comm
-from mscclpp.utils import KernelBuilder, pack
+from mscclpp.utils import KernelBuilder, GpuBuffer, pack
 from ._cpp import _ext
 from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group

@@ -156,12 +156,26 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str):
    create_group_and_connection(mpi_group, transport)


+@parametrize_mpi_groups(1)
+@pytest.mark.parametrize("nelem", [2**i for i in [0, 10, 15, 20]])
+@pytest.mark.parametrize("dtype", [cp.float32, cp.float16])
+def test_gpu_buffer(mpi_group: MpiGroup, nelem: int, dtype: cp.dtype):
+    memory = GpuBuffer(nelem, dtype=dtype)
+    assert memory.shape == (nelem,)
+    assert memory.dtype == dtype
+    assert memory.itemsize == cp.dtype(dtype).itemsize
+    assert memory.nbytes == nelem * cp.dtype(dtype).itemsize
+    assert memory.data.ptr != 0
+    assert memory.data.mem.ptr != 0
+    assert memory.data.mem.size >= nelem * cp.dtype(dtype).itemsize
+
+
@parametrize_mpi_groups(2, 4, 8, 16)
@pytest.mark.parametrize("transport", ["IB", "NVLink"])
@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
 def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int):
    group, connections = create_group_and_connection(mpi_group, transport)
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
    nelemPerRank = nelem // group.nranks
    sizePerRank = nelemPerRank * memory.itemsize
    memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -436,13 +450,12 @@ def test_d2d_semaphores(mpi_group: MpiGroup):
 def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool):
    group, connections = create_group_and_connection(mpi_group, "NVLink")

-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
    if use_packet:
-        scratch = cp.zeros(nelem * 2, dtype=cp.int32)
+        scratch = GpuBuffer(nelem * 2, dtype=cp.int32)
    else:
        scratch = None
    nelemPerRank = nelem // group.nranks
-    nelemPerRank * memory.itemsize
    memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
    memory_expected = cp.zeros_like(memory)
    for rank in range(group.nranks):
@@ -484,7 +497,7 @@ def test_fifo(
 def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
    group, connections = create_group_and_connection(mpi_group, transport)

-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
    nelemPerRank = nelem // group.nranks
    nelemPerRank * memory.itemsize
    memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -534,11 +547,11 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
 def test_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool):
    group, connections = create_group_and_connection(mpi_group, transport)

-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
    if use_packet:
-        scratch = cp.zeros(nelem * 2, dtype=cp.int32)
+        scratch = GpuBuffer(nelem * 2, dtype=cp.int32)
    else:
-        scratch = cp.zeros(1, dtype=cp.int32)  # just so that we can pass a valid ptr
+        scratch = GpuBuffer(1, dtype=cp.int32)  # just so that we can pass a valid ptr
    nelemPerRank = nelem // group.nranks
    nelemPerRank * memory.itemsize
    memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
--- a/src/connection.cc
+++ b/src/connection.cc
@@ -301,8 +301,8 @@ void EthernetConnection::write(RegisteredMemory dst, uint64_t dstOffset, Registe
    uint64_t dataSize =
        std::min(sendBufferSize_ - headerSize / sizeof(char), (size - sentDataSize) / sizeof(char)) * sizeof(char);
    uint64_t messageSize = dataSize + headerSize;
-    mscclpp::memcpyCuda<char>(sendBuffer_.data() + headerSize / sizeof(char),
-                              (char*)srcPtr + (sentDataSize / sizeof(char)), dataSize, cudaMemcpyDeviceToHost);
+    mscclpp::gpuMemcpy(sendBuffer_.data() + headerSize / sizeof(char), srcPtr + (sentDataSize / sizeof(char)), dataSize,
+                       cudaMemcpyDeviceToHost);
    sendSocket_->send(sendBuffer_.data(), messageSize);
    sentDataSize += messageSize;
    headerSize = 0;
@@ -402,8 +402,7 @@ void EthernetConnection::recvMessages() {
      received &= !closed;

      if (received)
-        mscclpp::memcpyCuda<char>((char*)ptr + (recvSize / sizeof(char)), recvBuffer_.data(), messageSize,
-                                  cudaMemcpyHostToDevice);
+        mscclpp::gpuMemcpy(ptr + (recvSize / sizeof(char)), recvBuffer_.data(), messageSize, cudaMemcpyHostToDevice);
      recvSize += messageSize;
    }

--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -155,10 +155,10 @@ struct Executor::Impl {
      plan.impl_->lightLoadExecutionPlan(inputMessageSize, outputMessageSize, constSrcOffset, constDstOffset);
      this->setupDeviceExecutionPlan(this->contexts[key], devicePlanKey, rank, plan);
      this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey] =
-          allocExtSharedCuda<char>(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
-      memcpyCuda(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(),
-                 (char*)devicePlans[devicePlanKey].data(),
-                 devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
+          GpuBuffer(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory();
+      gpuMemcpy(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(),
+                (char*)devicePlans[devicePlanKey].data(),
+                devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
      this->contexts[key].currentDevicePlan = devicePlanKey;
      return this->contexts[key];
    }
@@ -170,12 +170,7 @@ struct Executor::Impl {
    size_t maxScratchBufferSize = plan.impl_->getMaxScratchBufferSize(rank);
    size_t scratchBufferSize =
        std::min(plan.impl_->getScratchBufferSize(rank, sendMemRange, recvMemRange), maxScratchBufferSize);
-    std::shared_ptr<char> scratchBuffer;
-    if (isNvlsSupported()) {
-      scratchBuffer = allocSharedPhysicalCuda<char>(scratchBufferSize);
-    } else {
-      scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
-    }
+    std::shared_ptr<char> scratchBuffer = GpuBuffer(scratchBufferSize).memory();
    context.scratchBuffer = scratchBuffer;
    context.scratchBufferSize = scratchBufferSize;
    context.proxyService = std::make_shared<ProxyService>();
@@ -186,11 +181,10 @@ struct Executor::Impl {
    this->setupNvlsChannels(context, sendbuff, recvbuff, sendMemRange, recvMemRange, rank, plan);
    this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan);
    context.deviceExecutionPlansBuffers[devicePlanKey] =
-        allocExtSharedCuda<char>(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
-    memcpyCuda(context.deviceExecutionPlansBuffers[devicePlanKey].get(),
-               (char*)context.deviceExecutionPlans[devicePlanKey].data(),
-               context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan),
-               cudaMemcpyHostToDevice);
+        GpuBuffer(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory();
+    gpuMemcpy(context.deviceExecutionPlansBuffers[devicePlanKey].get(),
+              (char*)context.deviceExecutionPlans[devicePlanKey].data(),
+              context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
    context.currentDevicePlan = devicePlanKey;
    context.proxyService->startProxy();
    this->contexts.insert({key, context});
--- a/src/fifo.cc
+++ b/src/fifo.cc
@@ -10,9 +10,9 @@
 namespace mscclpp {

 struct Fifo::Impl {
-  UniqueCudaHostPtr<ProxyTrigger[]> triggers;
-  UniqueCudaPtr<uint64_t> head;
-  UniqueCudaPtr<uint64_t> tailReplica;
+  detail::UniqueGpuHostPtr<ProxyTrigger> triggers;
+  detail::UniqueGpuPtr<uint64_t> head;
+  detail::UniqueGpuPtr<uint64_t> tailReplica;
  const int size;

  // allocated on the host. Only accessed by the host. This is a copy of the
@@ -28,9 +28,9 @@ struct Fifo::Impl {
  CudaStreamWithFlags stream;

  Impl(int size)
-      : triggers(makeUniqueCudaHost<ProxyTrigger[]>(size)),
-        head(allocUniqueCuda<uint64_t>()),
-        tailReplica(allocUniqueCuda<uint64_t>()),
+      : triggers(detail::gpuCallocHostUnique<ProxyTrigger>(size)),
+        head(detail::gpuCallocUnique<uint64_t>()),
+        tailReplica(detail::gpuCallocUnique<uint64_t>()),
        size(size),
        hostTail(0),
        stream(cudaStreamNonBlocking) {}
--- a/src/gpu_utils.cc
+++ b/src/gpu_utils.cc
@@ -0,0 +1,165 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include <cstring>
+#include <mscclpp/gpu.hpp>
+#include <mscclpp/gpu_utils.hpp>
+
+namespace mscclpp {
+
+namespace detail {
+
+/// set memory access permission to read-write
+/// @param base Base memory pointer.
+/// @param size Size of the memory.
+void setReadWriteMemoryAccess(void* base, size_t size) {
+  CUmemAccessDesc accessDesc = {};
+  int deviceId;
+  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
+  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  accessDesc.location.id = deviceId;
+  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, size, &accessDesc, 1));
+}
+
+void* gpuCalloc(size_t bytes) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  void* ptr;
+  CudaStreamWithFlags stream(cudaStreamNonBlocking);
+  MSCCLPP_CUDATHROW(cudaMalloc(&ptr, bytes));
+  MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, bytes, stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
+  return ptr;
+}
+
+void* gpuCallocHost(size_t bytes) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  void* ptr;
+  MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, bytes, cudaHostAllocMapped | cudaHostAllocWriteCombined));
+  ::memset(ptr, 0, bytes);
+  return ptr;
+}
+
+#if defined(__HIP_PLATFORM_AMD__)
+void* gpuCallocUncached(size_t bytes) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  void* ptr;
+  CudaStreamWithFlags stream(cudaStreamNonBlocking);
+  MSCCLPP_CUDATHROW(hipExtMallocWithFlags((void**)&ptr, bytes, hipDeviceMallocUncached));
+  MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, bytes, stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
+  return ptr;
+}
+#endif  // defined(__HIP_PLATFORM_AMD__)
+
+#if (CUDA_NVLS_SUPPORTED)
+size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
+  size_t gran = 0;
+  int numDevices = 0;
+  MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices));
+
+  CUmulticastObjectProp prop = {};
+  prop.size = size;
+  // This is a dummy value, it might affect the granularity in the future
+  prop.numDevices = numDevices;
+  prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
+  prop.flags = 0;
+  MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag));
+  return gran;
+}
+
+void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  int deviceId = -1;
+  CUdevice currentDevice;
+  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
+  MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));
+
+  CUmemAllocationProp prop = {};
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  prop.requestedHandleTypes =
+      (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
+  prop.location.id = currentDevice;
+
+  if (gran == 0) {
+    gran = getMulticastGranularity(bytes, CU_MULTICAST_GRANULARITY_RECOMMENDED);
+  }
+
+  // allocate physical memory
+  CUmemGenericAllocationHandle memHandle;
+  size_t nbytes = (bytes + gran - 1) / gran * gran;
+  MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));
+
+  if (align == 0) {
+    align = getMulticastGranularity(nbytes, CU_MULTICAST_GRANULARITY_MINIMUM);
+  }
+
+  void* devicePtr = nullptr;
+  MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, align, 0U, 0));
+  MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0));
+  setReadWriteMemoryAccess(devicePtr, nbytes);
+  CudaStreamWithFlags stream(cudaStreamNonBlocking);
+  MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
+
+  return devicePtr;
+}
+#endif  // CUDA_NVLS_SUPPORTED
+
+void gpuFree(void* ptr) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  MSCCLPP_CUDATHROW(cudaFree(ptr));
+}
+
+void gpuFreeHost(void* ptr) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
+}
+
+#if (CUDA_NVLS_SUPPORTED)
+void gpuFreePhysical(void* ptr) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  CUmemGenericAllocationHandle handle;
+  size_t size = 0;
+  MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, ptr));
+  MSCCLPP_CUTHROW(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+  MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, size));
+  MSCCLPP_CUTHROW(cuMemRelease(handle));
+  MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, size));
+}
+#endif  // CUDA_NVLS_SUPPORTED
+
+void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream, cudaMemcpyKind kind) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, bytes, kind, stream));
+}
+
+void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  CudaStreamWithFlags stream(cudaStreamNonBlocking);
+  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, bytes, kind, stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
+}
+
+}  // namespace detail
+
+bool isNvlsSupported() {
+  [[maybe_unused]] static bool result = false;
+  [[maybe_unused]] static bool isChecked = false;
+#if (CUDA_NVLS_SUPPORTED)
+  if (!isChecked) {
+    int isMulticastSupported;
+    int isFabricSupported;
+    CUdevice dev;
+    MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
+    MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
+    MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev));
+    result = (isMulticastSupported == 1 && isFabricSupported == 1);
+  }
+  return result;
+#endif
+  return false;
+}
+
+}  // namespace mscclpp
--- a/src/npkit/npkit.cc
+++ b/src/npkit/npkit.cc
@@ -12,16 +12,16 @@

 uint64_t NpKit::rank_ = 0;

-std::vector<mscclpp::UniqueCudaPtr<NpKitEvent>> NpKit::gpu_event_buffers_;
+std::vector<mscclpp::detail::UniqueGpuPtr<NpKitEvent>> NpKit::gpu_event_buffers_;
 std::vector<std::unique_ptr<NpKitEvent[]>> NpKit::cpu_event_buffers_;

-mscclpp::UniqueCudaPtr<NpKitEventCollectContext> NpKit::gpu_collect_contexts_;
+mscclpp::detail::UniqueGpuPtr<NpKitEventCollectContext> NpKit::gpu_collect_contexts_;
 std::unique_ptr<NpKitEventCollectContext[]> NpKit::cpu_collect_contexts_;

 #if defined(__HIP_PLATFORM_AMD__)
-mscclpp::UniqueCudaHostPtr<uint64_t[]> NpKit::cpu_timestamp_;
+mscclpp::detail::UniqueGpuHostPtr<uint64_t[]> NpKit::cpu_timestamp_;
 #else
-mscclpp::UniqueCudaHostPtr<uint64_t> NpKit::cpu_timestamp_;
+mscclpp::detail::UniqueGpuHostPtr<uint64_t> NpKit::cpu_timestamp_;
 #endif
 std::unique_ptr<std::thread> NpKit::cpu_timestamp_update_thread_;
 volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false;
@@ -53,11 +53,11 @@ void NpKit::Init(int rank) {
  rank_ = rank;

  // Init event data structures
-  gpu_collect_contexts_ = mscclpp::allocUniqueCuda<NpKitEventCollectContext>(NpKit::kNumGpuEventBuffers);
+  gpu_collect_contexts_ = mscclpp::detail::gpuCallocUnique<NpKitEventCollectContext>(NpKit::kNumGpuEventBuffers);
  for (i = 0; i < NpKit::kNumGpuEventBuffers; i++) {
-    gpu_event_buffers_.emplace_back(mscclpp::allocUniqueCuda<NpKitEvent>(kMaxNumGpuEventsPerBuffer));
+    gpu_event_buffers_.emplace_back(mscclpp::detail::gpuCallocUnique<NpKitEvent>(kMaxNumGpuEventsPerBuffer));
    ctx.event_buffer = gpu_event_buffers_[i].get();
-    mscclpp::memcpyCuda(gpu_collect_contexts_.get() + i, &ctx, 1);
+    mscclpp::gpuMemcpy(gpu_collect_contexts_.get() + i, &ctx, 1);
  }

  cpu_collect_contexts_ = std::make_unique<NpKitEventCollectContext[]>(NpKit::kNumCpuEventBuffers);
@@ -69,15 +69,15 @@ void NpKit::Init(int rank) {

 #if defined(__HIP_PLATFORM_AMD__)
  // Init timestamp. Allocates MAXCHANNELS*128 bytes buffer for GPU
-  cpu_timestamp_ = mscclpp::makeUniqueCudaHost<uint64_t[]>(NPKIT_MAX_NUM_GPU_THREADBLOCKS *
-                                                           NPKIT_CPU_TIMESTAMP_SLOT_SIZE / sizeof(uint64_t));
+  cpu_timestamp_ = mscclpp::detail::gpuCallocHostUnique<uint64_t[]>(NPKIT_MAX_NUM_GPU_THREADBLOCKS *
+                                                                    NPKIT_CPU_TIMESTAMP_SLOT_SIZE / sizeof(uint64_t));
  for (int i = 0; i < NPKIT_MAX_NUM_GPU_THREADBLOCKS; i++) {
    NPKIT_STORE_CPU_TIMESTAMP_PER_BLOCK(cpu_timestamp_.get(),
                                        std::chrono::system_clock::now().time_since_epoch().count(), i);
  }
 #else
  // Init timestamp
-  cpu_timestamp_ = mscclpp::makeUniqueCudaHost<uint64_t>();
+  cpu_timestamp_ = mscclpp::detail::gpuCallocHostUnique<uint64_t>();
  volatile uint64_t* volatile_cpu_timestamp = cpu_timestamp_.get();
  *volatile_cpu_timestamp = std::chrono::system_clock::now().time_since_epoch().count();
 #endif
@@ -153,8 +153,8 @@ void NpKit::Dump(const std::string& dump_dir) {
    dump_file_path += std::to_string(rank_);
    dump_file_path += "_buf_";
    dump_file_path += std::to_string(i);
-    mscclpp::memcpyCuda(cpu_event_buffers_[0].get(), gpu_event_buffers_[i].get(), kMaxNumGpuEventsPerBuffer);
-    mscclpp::memcpyCuda(cpu_collect_contexts_.get(), gpu_collect_contexts_.get() + i, 1);
+    mscclpp::gpuMemcpy(cpu_event_buffers_[0].get(), gpu_event_buffers_[i].get(), kMaxNumGpuEventsPerBuffer);
+    mscclpp::gpuMemcpy(cpu_collect_contexts_.get(), gpu_collect_contexts_.get() + i, 1);
    auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
    gpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[0].get()),
                         cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
--- a/src/nvls.cc
+++ b/src/nvls.cc
@@ -58,7 +58,7 @@ NvlsConnection::Impl::Impl(size_t bufferSize, int numDevices) {
  mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
  MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM));
  MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED));
-  mcProp_.size = ((mcProp_.size + minMcGran_ - 1) / minMcGran_) * minMcGran_;
+  mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_;
  bufferSize_ = mcProp_.size;
  MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_));
  mcFileDesc_ = 0;
@@ -200,7 +200,7 @@ std::shared_ptr<char> NvlsConnection::Impl::bindMemory(CUdeviceptr devicePtr, si
  char* mcPtr;
  MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0));
  MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0));
-  setReadWriteMemoryAccess(mcPtr, devBuffSize);
+  detail::setReadWriteMemoryAccess(mcPtr, devBuffSize);
  INFO(MSCCLPP_COLL, "NVLS connection bound memory at offset %ld, size %ld", offset, devBuffSize);

  auto deleter = [=, self = shared_from_this()](char* ptr) {
@@ -240,8 +240,6 @@ class NvlsConnection::Impl {
 };
 #endif  // !(CUDA_NVLS_SUPPORTED)

-const int NvlsConnection::DefaultNvlsBufferSize = (1 << 29);
-
 NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices)
    : pimpl_(std::make_shared<Impl>(bufferSize, numDevices)) {}

--- a/src/registered_memory.cc
+++ b/src/registered_memory.cc
@@ -244,7 +244,7 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
      size_t gran = getRecommendedGranularity();
      MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&base, this->size, gran, 0, 0));
      MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)base, this->size, 0, handle, 0));
-      setReadWriteMemoryAccess(base, this->size);
+      detail::setReadWriteMemoryAccess(base, this->size);
      this->data = static_cast<char*>(base) + entry.offsetFromBase;
    } else {
      MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
--- a/src/semaphore.cc
+++ b/src/semaphore.cc
@@ -19,9 +19,17 @@ static NonblockingFuture<RegisteredMemory> setupInboundSemaphoreId(Communicator&
  return communicator.recvMemoryOnSetup(remoteRank, tag);
 }

+static detail::UniqueGpuPtr<uint64_t> createGpuSemaphoreId() {
+#if defined(__HIP_PLATFORM_AMD__)
+  return detail::gpuCallocUncachedUnique<uint64_t>();
+#else   // !defined(__HIP_PLATFORM_AMD__)
+  return detail::gpuCallocUnique<uint64_t>();
+#endif  // !defined(__HIP_PLATFORM_AMD__)
+}
+
 MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator,
                                                           std::shared_ptr<Connection> connection)
-    : BaseSemaphore(allocExtUniqueCuda<uint64_t>(), allocExtUniqueCuda<uint64_t>(), std::make_unique<uint64_t>()),
+    : BaseSemaphore(createGpuSemaphoreId(), createGpuSemaphoreId(), std::make_unique<uint64_t>()),
      connection_(connection) {
  INFO(MSCCLPP_INIT, "Creating a Host2Device semaphore for %s transport from %d to %d",
       connection->getTransportName().c_str(), communicator.bootstrap()->getRank(),
@@ -85,7 +93,7 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) {

 MSCCLPP_API_CPP SmDevice2DeviceSemaphore::SmDevice2DeviceSemaphore(Communicator& communicator,
                                                                   std::shared_ptr<Connection> connection)
-    : BaseSemaphore(allocExtUniqueCuda<uint64_t>(), allocExtUniqueCuda<uint64_t>(), allocExtUniqueCuda<uint64_t>()) {
+    : BaseSemaphore(createGpuSemaphoreId(), createGpuSemaphoreId(), createGpuSemaphoreId()) {
  INFO(MSCCLPP_INIT, "Creating a Device2Device semaphore for %s transport from %d to %d",
       connection->getTransportName().c_str(), communicator.bootstrap()->getRank(),
       communicator.remoteRankOf(*connection));
--- a/src/utils.cc
+++ b/src/utils.cc
@@ -7,7 +7,6 @@
 #include <chrono>
 #include <iostream>
 #include <mscclpp/errors.hpp>
-#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/utils.hpp>
 #include <sstream>
 #include <string>
@@ -67,22 +66,4 @@ std::string getHostName(int maxlen, const char delim) {
  return hostname.substr(0, i);
 }

-bool isNvlsSupported() {
-  [[maybe_unused]] static bool result = false;
-  [[maybe_unused]] static bool isChecked = false;
-#if (CUDA_NVLS_SUPPORTED)
-  if (!isChecked) {
-    int isMulticastSupported;
-    int isFabricSupported;
-    CUdevice dev;
-    MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
-    MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
-    MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev));
-    result = (isMulticastSupported == 1 && isFabricSupported == 1);
-  }
-  return result;
-#endif
-  return false;
-}
-
 }  // namespace mscclpp
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -129,12 +129,7 @@ int main(int argc, char* argv[]) {
  }

  mscclpp::ExecutionPlan plan(executionPlanPath);
-  std::shared_ptr<char> sendbuff;
-  if (mscclpp::isNvlsSupported()) {
-    sendbuff = mscclpp::allocSharedPhysicalCuda<char>(bufferSize);
-  } else {
-    sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
-  }
+  std::shared_ptr<char> sendbuff = mscclpp::GpuBuffer(bufferSize).memory();
  std::vector<int> dataHost(bufferSize / sizeof(int), rank);
  MSCCLPP_CUDATHROW(cudaMemcpy(sendbuff.get(), dataHost.data(), bufferSize, cudaMemcpyHostToDevice));
  double deltaSec = benchTime(rank, bootstrap, executor, plan, sendbuff, bufferSize, niters, ngraphIters, packetType);
--- a/test/mp_unit/communicator_tests.cu
+++ b/test/mp_unit/communicator_tests.cu
@@ -113,7 +113,7 @@ void CommunicatorTest::SetUp() {
  }

  for (size_t n = 0; n < numBuffers; n++) {
-    devicePtr[n] = mscclpp::allocSharedCuda<int>(deviceBufferSize / sizeof(int));
+    devicePtr[n] = mscclpp::detail::gpuCallocShared<int>(deviceBufferSize / sizeof(int));
    registerMemoryPairs(devicePtr[n].get(), deviceBufferSize, mscclpp::Transport::CudaIpc | ibTransport, 0, remoteRanks,
                        localMemory[n], remoteMemory[n]);
  }
@@ -133,7 +133,7 @@ void CommunicatorTest::deviceBufferInit() {
    for (size_t i = 0; i < dataCount; i++) {
      hostBuffer[i] = gEnv->rank + n * gEnv->worldSize;
    }
-    mscclpp::memcpyCuda<int>(devicePtr[n].get(), hostBuffer.data(), dataCount, cudaMemcpyHostToDevice);
+    mscclpp::gpuMemcpy<int>(devicePtr[n].get(), hostBuffer.data(), dataCount, cudaMemcpyHostToDevice);
  }
 }

@@ -155,7 +155,7 @@ bool CommunicatorTest::testWriteCorrectness(bool skipLocal) {
  size_t dataCount = deviceBufferSize / sizeof(int);
  for (int n = 0; n < (int)devicePtr.size(); n++) {
    std::vector<int> hostBuffer(dataCount, 0);
-    mscclpp::memcpyCuda<int>(hostBuffer.data(), devicePtr[n].get(), dataCount, cudaMemcpyDeviceToHost);
+    mscclpp::gpuMemcpy<int>(hostBuffer.data(), devicePtr[n].get(), dataCount, cudaMemcpyDeviceToHost);
    for (int i = 0; i < gEnv->worldSize; i++) {
      if (((i / gEnv->nRanksPerNode) == (gEnv->rank / gEnv->nRanksPerNode)) && skipLocal) {
        continue;
@@ -214,12 +214,13 @@ TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) {
  deviceBufferInit();
  communicator->bootstrap()->barrier();

-  auto deviceSemaphoreHandles = mscclpp::allocSharedCuda<mscclpp::Host2DeviceSemaphore::DeviceHandle>(gEnv->worldSize);
+  auto deviceSemaphoreHandles =
+      mscclpp::detail::gpuCallocShared<mscclpp::Host2DeviceSemaphore::DeviceHandle>(gEnv->worldSize);
  for (int i = 0; i < gEnv->worldSize; i++) {
    if (i != gEnv->rank) {
      mscclpp::Host2DeviceSemaphore::DeviceHandle deviceHandle = semaphores[i]->deviceHandle();
-      mscclpp::memcpyCuda<mscclpp::Host2DeviceSemaphore::DeviceHandle>(deviceSemaphoreHandles.get() + i, &deviceHandle,
-                                                                       1, cudaMemcpyHostToDevice);
+      mscclpp::gpuMemcpy<mscclpp::Host2DeviceSemaphore::DeviceHandle>(deviceSemaphoreHandles.get() + i, &deviceHandle,
+                                                                      1, cudaMemcpyHostToDevice);
    }
  }
  communicator->bootstrap()->barrier();
--- a/test/mp_unit/executor_tests.cc
+++ b/test/mp_unit/executor_tests.cc
@@ -57,7 +57,7 @@ TEST_F(ExecutorTest, TwoNodesAllreduce) {
      path.parent_path().parent_path().parent_path() / "test/execution-files/allreduce.json";
  mscclpp::ExecutionPlan plan(executionFilesPath.string());
  const int bufferSize = 1024 * 1024;
-  std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
+  std::shared_ptr<char> sendbuff = mscclpp::GpuBuffer(bufferSize).memory();
  mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
  executor->execute(gEnv->rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16,
                    plan, stream);
--- a/test/mp_unit/ib_tests.cu
+++ b/test/mp_unit/ib_tests.cu
@@ -82,7 +82,7 @@ TEST_F(IbPeerToPeerTest, SimpleSendRecv) {

  const int maxIter = 100000;
  const int nelem = 1;
-  auto data = mscclpp::allocUniqueCuda<int>(nelem);
+  auto data = mscclpp::detail::gpuCallocUnique<int>(nelem);

  registerBufferAndConnect(data.get(), sizeof(int) * nelem);

@@ -196,7 +196,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) {
  const uint64_t signalPeriod = 1024;
  const uint64_t maxIter = 10000;
  const uint64_t nelem = 65536 + 1;
-  auto data = mscclpp::allocUniqueCuda<uint64_t>(nelem);
+  auto data = mscclpp::detail::gpuCallocUnique<uint64_t>(nelem);

  registerBufferAndConnect(data.get(), sizeof(uint64_t) * nelem);

@@ -205,12 +205,14 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) {

  if (gEnv->rank == 0) {
    // Receiver
-    auto curIter = mscclpp::makeUniqueCudaHost<uint64_t>(0);
-    auto result = mscclpp::makeUniqueCudaHost<int>(0);
+    auto curIter = mscclpp::detail::gpuCallocHostUnique<uint64_t>();
+    auto result = mscclpp::detail::gpuCallocHostUnique<int>();

    volatile uint64_t* ptrCurIter = (volatile uint64_t*)curIter.get();
    volatile int* ptrResult = (volatile int*)result.get();

+    ASSERT_NE(ptrCurIter, nullptr);
+    ASSERT_NE(ptrResult, nullptr);
    ASSERT_EQ(*ptrCurIter, 0);
    ASSERT_EQ(*ptrResult, 0);

@@ -246,7 +248,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) {
      for (uint64_t i = 0; i < nelem; i++) {
        hostBuffer[i] = iter;
      }
-      mscclpp::memcpyCuda<uint64_t>(data.get(), hostBuffer.data(), nelem, cudaMemcpyHostToDevice);
+      mscclpp::gpuMemcpy<uint64_t>(data.get(), hostBuffer.data(), nelem, cudaMemcpyHostToDevice);

      // Need to signal from time to time to empty the IB send queue
      bool signaled = (iter % signalPeriod == 0);
@@ -303,7 +305,7 @@ TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) {

  const int maxIter = 100000;
  const int nelem = 1;
-  auto data = mscclpp::allocUniqueCuda<int>(nelem);
+  auto data = mscclpp::detail::gpuCallocUnique<int>(nelem);

  registerBufferAndConnect(data.get(), sizeof(int) * nelem);

--- a/test/mp_unit/proxy_channel_tests.cu
+++ b/test/mp_unit/proxy_channel_tests.cu
@@ -157,7 +157,7 @@ void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) {
  const int nElem = 4 * 1024 * 1024;

  std::vector<mscclpp::ProxyChannel> proxyChannels;
-  std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
+  std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
  setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int));

  std::vector<DeviceHandle<mscclpp::ProxyChannel>> proxyChannelHandles;
@@ -169,7 +169,7 @@ void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) {

  proxyService->startProxy();

-  std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
+  std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();

  const int nTries = 1000;

@@ -202,7 +202,7 @@ void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
  const int nElem = 4 * 1024 * 1024;

  std::vector<mscclpp::ProxyChannel> proxyChannels;
-  std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
+  std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
  setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int));

  std::vector<DeviceHandle<mscclpp::ProxyChannel>> proxyChannelHandles;
@@ -214,7 +214,7 @@ void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {

  proxyService->startProxy();

-  std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
+  std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();

  auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
  const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name());
@@ -344,11 +344,11 @@ void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) {
  const int nElem = 4 * 1024 * 1024;

  std::vector<mscclpp::ProxyChannel> proxyChannels;
-  std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
+  std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();

  const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t);
-  auto putPacketBuffer = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(nPacket);
-  auto getPacketBuffer = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(nPacket);
+  auto putPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();
+  auto getPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();

  setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(),
                       nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket));
@@ -368,7 +368,7 @@ void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) {

  proxyService->startProxy();

-  std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
+  std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();

  const int nTries = 1000;

@@ -411,11 +411,11 @@ void ProxyChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) {
  const int nElem = 4 * 1024 * 1024;

  std::vector<mscclpp::ProxyChannel> proxyChannels;
-  std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
+  std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();

  const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t);
-  auto putPacketBuffer = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(nPacket);
-  auto getPacketBuffer = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(nPacket);
+  auto putPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();
+  auto getPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();

  setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(),
                       nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket));
--- a/test/mp_unit/sm_channel_tests.cu
+++ b/test/mp_unit/sm_channel_tests.cu
@@ -77,8 +77,8 @@ void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, Packe
  const int defaultNTries = 1000;

  std::vector<mscclpp::SmChannel> smChannels;
-  std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
-  std::shared_ptr<int> intermBuff = mscclpp::allocExtSharedCuda<int>(nElem * 2);
+  std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
+  std::shared_ptr<int> intermBuff = mscclpp::GpuBuffer<int>(nElem * 2).memory();
  setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int), intermBuff.get(), nElem * 2 * sizeof(int));
  std::vector<DeviceHandle<mscclpp::SmChannel>> deviceHandles(smChannels.size());
  std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(),
@@ -88,7 +88,7 @@ void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, Packe
  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(),
                                       sizeof(DeviceHandle<mscclpp::SmChannel>)));

-  std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
+  std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();

  // The least nelem is 2 for packet ping pong
  kernelWrapper(buff.get(), gEnv->rank, 2, ret.get(), defaultNTries);
@@ -178,7 +178,7 @@ TEST_F(SmChannelOneToOneTest, PutPingPong) {
  const int nElem = 4 * 1024 * 1024;

  std::vector<mscclpp::SmChannel> smChannels;
-  std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
+  std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
  setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int));
  std::vector<DeviceHandle<mscclpp::SmChannel>> deviceHandles(smChannels.size());
  std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(),
@@ -188,7 +188,7 @@ TEST_F(SmChannelOneToOneTest, PutPingPong) {
  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(),
                                       sizeof(DeviceHandle<mscclpp::SmChannel>)));

-  std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
+  std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();

  kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get());
  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
@@ -257,7 +257,7 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) {
  const int nElem = 4 * 1024 * 1024;

  std::vector<mscclpp::SmChannel> smChannels;
-  std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
+  std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
  setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int));
  std::vector<DeviceHandle<mscclpp::SmChannel>> deviceHandles(smChannels.size());
  std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(),
@@ -267,7 +267,7 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) {
  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(),
                                       sizeof(DeviceHandle<mscclpp::SmChannel>)));

-  std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
+  std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();

  kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get());
  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
--- a/test/mscclpp-test/allgather_test.cu
+++ b/test/mscclpp-test/allgather_test.cu
@@ -2,6 +2,7 @@
 // Licensed under the MIT license.

 #include <algorithm>
+#include <cstring>
 #include <mscclpp/concurrency_device.hpp>
 #include <string>

@@ -711,13 +712,13 @@ class AllGatherTestEngine : public BaseTestEngine {
 AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args, "allgather") {}

 void AllGatherTestEngine::allocateBuffer() {
-  sendBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
+  sendBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
  expectedBuff_ = std::shared_ptr<int[]>(new int[args_.maxBytes / sizeof(int)]);
  if (args_.kernelNum == 7) {
    const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
    // 2x for double-buffering, scratchBuff used to store original data and reduced results
    const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/;
-    scratchPacketBuff_ = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(scratchBuffNelem);
+    scratchPacketBuff_ = mscclpp::GpuBuffer<mscclpp::LLPacket>(scratchBuffNelem).memory();
  }
 }

--- a/test/mscclpp-test/allreduce_test.cu
+++ b/test/mscclpp-test/allreduce_test.cu
@@ -2,6 +2,7 @@
 // Licensed under the MIT license.

 #include <algorithm>
+#include <cstring>
 #include <mscclpp/concurrency_device.hpp>
 #include <mscclpp/packet_device.hpp>
 #include <vector>
@@ -1272,30 +1273,30 @@ bool AllReduceTestEngine::isInPlace() const {
 }

 void AllReduceTestEngine::allocateBuffer() {
-  inputBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
-  resultBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
+  inputBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
+  resultBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
  inputBuff = inputBuff_.get();
  resultBuff = resultBuff_.get();

  if (args_.kernelNum == 0 || args_.kernelNum == 1 || args_.kernelNum == 3 || args_.kernelNum == 4) {
-    scratchBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
+    scratchBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
    scratchBuff = scratchBuff_.get();
  } else if (args_.kernelNum == 2) {
    const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
    // 2x for double-buffering
    const size_t scratchBuffNelem = nPacket * std::max(args_.nRanksPerNode - 1, 1) * 2;
-    scratchPacketBuff_ = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(scratchBuffNelem);
+    scratchPacketBuff_ = mscclpp::GpuBuffer<mscclpp::LLPacket>(scratchBuffNelem).memory();
    scratchPacketBuff = scratchPacketBuff_.get();
    const size_t packetBuffNelem = nPacket * 2;
-    putPacketBuff_ = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(packetBuffNelem);
-    getPacketBuff_ = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(packetBuffNelem);
+    putPacketBuff_ = mscclpp::GpuBuffer<mscclpp::LLPacket>(packetBuffNelem).memory();
+    getPacketBuff_ = mscclpp::GpuBuffer<mscclpp::LLPacket>(packetBuffNelem).memory();
    putPacketBuff = putPacketBuff_.get();
    getPacketBuff = getPacketBuff_.get();
  } else if (args_.kernelNum == 6 || args_.kernelNum == 7) {
    const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
    // 2x for double-buffering, scratchBuff used to store original data and reduced results
    const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/;
-    scratchPacketBuff_ = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(scratchBuffNelem);
+    scratchPacketBuff_ = mscclpp::GpuBuffer<mscclpp::LLPacket>(scratchBuffNelem).memory();
    scratchPacketBuff = scratchPacketBuff_.get();
  }

--- a/test/mscclpp-test/alltoall_test.cu
+++ b/test/mscclpp-test/alltoall_test.cu
@@ -2,6 +2,7 @@
 // Licensed under the MIT license.

 #include <cstdint>
+#include <cstring>
 #include <mscclpp/concurrency_device.hpp>

 #include "common.hpp"
@@ -139,8 +140,8 @@ class AllToAllTestEngine : public BaseTestEngine {
 AllToAllTestEngine::AllToAllTestEngine(const TestArgs& args) : BaseTestEngine(args, "alltoall") { inPlace_ = false; }

 void AllToAllTestEngine::allocateBuffer() {
-  sendBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
-  recvBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
+  sendBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
+  recvBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
  expectedBuff_ = std::shared_ptr<int[]>(new int[args_.maxBytes / sizeof(int)]);

  localSendBuff = sendBuff_.get();
--- a/test/mscclpp-test/sendrecv_test.cu
+++ b/test/mscclpp-test/sendrecv_test.cu
@@ -137,8 +137,8 @@ class SendRecvTestEngine : public BaseTestEngine {
 SendRecvTestEngine::SendRecvTestEngine(const TestArgs& args) : BaseTestEngine(args, "sendrecv") { inPlace_ = false; }

 void SendRecvTestEngine::allocateBuffer() {
-  std::shared_ptr<int> sendBuff = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
-  std::shared_ptr<int> recvBuff = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
+  std::shared_ptr<int> sendBuff = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
+  std::shared_ptr<int> recvBuff = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
  devicePtrs_.push_back(sendBuff);
  devicePtrs_.push_back(recvBuff);

--- a/test/unit/cuda_utils_tests.cc
+++ b/test/unit/cuda_utils_tests.cc
@@ -6,23 +6,23 @@
 #include <mscclpp/gpu_utils.hpp>

 TEST(CudaUtilsTest, AllocShared) {
-  auto p1 = mscclpp::allocSharedCuda<uint32_t>();
-  auto p2 = mscclpp::allocSharedCuda<int64_t>(5);
+  auto p1 = mscclpp::detail::gpuCallocShared<uint32_t>();
+  auto p2 = mscclpp::detail::gpuCallocShared<int64_t>(5);
 }

 TEST(CudaUtilsTest, AllocUnique) {
-  auto p1 = mscclpp::allocUniqueCuda<uint32_t>();
-  auto p2 = mscclpp::allocUniqueCuda<int64_t>(5);
+  auto p1 = mscclpp::detail::gpuCallocUnique<uint32_t>();
+  auto p2 = mscclpp::detail::gpuCallocUnique<int64_t>(5);
 }

 TEST(CudaUtilsTest, MakeSharedHost) {
-  auto p1 = mscclpp::makeSharedCudaHost<uint32_t>();
-  auto p2 = mscclpp::makeSharedCudaHost<int64_t>(5);
+  auto p1 = mscclpp::detail::gpuCallocHostShared<uint32_t>();
+  auto p2 = mscclpp::detail::gpuCallocHostShared<int64_t>(5);
 }

 TEST(CudaUtilsTest, MakeUniqueHost) {
-  auto p1 = mscclpp::makeUniqueCudaHost<uint32_t>();
-  auto p2 = mscclpp::makeUniqueCudaHost<int64_t>(5);
+  auto p1 = mscclpp::detail::gpuCallocHostUnique<uint32_t>();
+  auto p2 = mscclpp::detail::gpuCallocHostUnique<int64_t>(5);
 }

 TEST(CudaUtilsTest, Memcpy) {
@@ -32,9 +32,9 @@ TEST(CudaUtilsTest, Memcpy) {
    hostBuff[i] = i + 1;
  }
  std::vector<int> hostBuffTmp(nElem, 0);
-  auto devBuff = mscclpp::allocSharedCuda<int>(nElem);
-  mscclpp::memcpyCuda<int>(devBuff.get(), hostBuff.data(), nElem, cudaMemcpyHostToDevice);
-  mscclpp::memcpyCuda<int>(hostBuffTmp.data(), devBuff.get(), nElem, cudaMemcpyDeviceToHost);
+  auto devBuff = mscclpp::detail::gpuCallocShared<int>(nElem);
+  mscclpp::gpuMemcpy<int>(devBuff.get(), hostBuff.data(), nElem, cudaMemcpyHostToDevice);
+  mscclpp::gpuMemcpy<int>(hostBuffTmp.data(), devBuff.get(), nElem, cudaMemcpyDeviceToHost);

  for (int i = 0; i < nElem; ++i) {
    EXPECT_EQ(hostBuff[i], hostBuffTmp[i]);