Add GpuBuffer class (#423)

* Renamed and moved the memory allocation functions into the `mscclpp::detail::`
namespace (now `mscclpp::detail::gpuCalloc*<T>()`)
* Deprecated the constructor-calling memory allocation functions
(`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`)
* Added a new `mscclpp::GpuBuffer<T>()` class that should generally be used
for allocating communication buffers (see the usage sketch after this list)
* Added a new `mscclpp.utils.GpuBuffer` Python class that inherits from
`cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc`
* Renamed the `mscclpp::memcpyCuda*<T>()` functions to
`mscclpp::gpuMemcpy*<T>()` for naming consistency
* A few fixes in NVLS memory allocation
* Fixed minor compiler warnings
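
For illustration, here is a minimal usage sketch of the new C++ API. It assumes the helpers live in `<mscclpp/gpu_utils.hpp>`, that `GpuBuffer<T>` is constructed from an element count, and that `memory()` returns a `std::shared_ptr<T>` that keeps the allocation alive, as the diff hunks below suggest; treat it as a sketch, not the definitive interface.

```cpp
#include <mscclpp/gpu_utils.hpp>

#include <cuda_runtime.h>

#include <memory>
#include <vector>

int main() {
  constexpr size_t kBytes = 1 << 20;

  // Allocate a 1 MiB communication buffer. GpuBuffer is expected to pick the
  // allocator internally (physical memory when NVLS is supported, the
  // extended allocator otherwise), replacing the old explicit branch.
  mscclpp::GpuBuffer<char> buf(kBytes);
  std::shared_ptr<char> mem = buf.memory();

  // Stage host data into the buffer with the renamed copy helper
  // (formerly memcpyCuda).
  std::vector<char> host(kBytes, 0);
  mscclpp::gpuMemcpy(mem.get(), host.data(), host.size(), cudaMemcpyHostToDevice);
  return 0;
}
```

Because `GpuBuffer` owns the allocator choice, callers no longer need to branch on NVLS support themselves, as the scratch-buffer hunk below shows.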
Author: Changho Hwang
Date: 2025-01-07 18:40:01 -08:00
Committed by: GitHub
Parent: 6d26b92665
Commit: 34945fb107
38 changed files with 527 additions and 555 deletions


@@ -155,10 +155,10 @@ struct Executor::Impl {
       plan.impl_->lightLoadExecutionPlan(inputMessageSize, outputMessageSize, constSrcOffset, constDstOffset);
       this->setupDeviceExecutionPlan(this->contexts[key], devicePlanKey, rank, plan);
       this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey] =
-          allocExtSharedCuda<char>(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
-      memcpyCuda(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(),
-                 (char*)devicePlans[devicePlanKey].data(),
-                 devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
+          GpuBuffer(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory();
+      gpuMemcpy(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(),
+                (char*)devicePlans[devicePlanKey].data(),
+                devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
       this->contexts[key].currentDevicePlan = devicePlanKey;
       return this->contexts[key];
     }
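
Note that the call sites keep their byte-based size expressions: `GpuBuffer` is constructed here without an explicit template argument, which suggests the element type defaults to `char` (the next hunk assigns its `memory()` result to a `std::shared_ptr<char>`).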
@@ -170,12 +170,7 @@ struct Executor::Impl {
       size_t maxScratchBufferSize = plan.impl_->getMaxScratchBufferSize(rank);
       size_t scratchBufferSize =
           std::min(plan.impl_->getScratchBufferSize(rank, sendMemRange, recvMemRange), maxScratchBufferSize);
-      std::shared_ptr<char> scratchBuffer;
-      if (isNvlsSupported()) {
-        scratchBuffer = allocSharedPhysicalCuda<char>(scratchBufferSize);
-      } else {
-        scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
-      }
+      std::shared_ptr<char> scratchBuffer = GpuBuffer(scratchBufferSize).memory();
       context.scratchBuffer = scratchBuffer;
       context.scratchBufferSize = scratchBufferSize;
       context.proxyService = std::make_shared<ProxyService>();
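
The `isNvlsSupported()` branch removed above is presumably what the `GpuBuffer` constructor now encapsulates. Below is a hedged sketch of the equivalent selection logic, using the allocators named in the removed lines; `allocScratch` is a hypothetical helper, and the actual internal implementation may differ.

```cpp
#include <mscclpp/gpu_utils.hpp>

#include <memory>

// Hypothetical helper sketching the choice GpuBuffer likely makes internally,
// reconstructed from the branch removed above.
std::shared_ptr<char> allocScratch(size_t bytes) {
  if (mscclpp::isNvlsSupported()) {
    // Physical allocation, so the buffer can later back an NVLS binding.
    return mscclpp::allocSharedPhysicalCuda<char>(bytes);
  }
  // Extended shared allocation on systems without NVLS support.
  return mscclpp::allocExtSharedCuda<char>(bytes);
}
```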
@@ -186,11 +181,10 @@ struct Executor::Impl {
       this->setupNvlsChannels(context, sendbuff, recvbuff, sendMemRange, recvMemRange, rank, plan);
       this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan);
       context.deviceExecutionPlansBuffers[devicePlanKey] =
-          allocExtSharedCuda<char>(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
-      memcpyCuda(context.deviceExecutionPlansBuffers[devicePlanKey].get(),
-                 (char*)context.deviceExecutionPlans[devicePlanKey].data(),
-                 context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan),
-                 cudaMemcpyHostToDevice);
+          GpuBuffer(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory();
+      gpuMemcpy(context.deviceExecutionPlansBuffers[devicePlanKey].get(),
+                (char*)context.deviceExecutionPlans[devicePlanKey].data(),
+                context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
       context.currentDevicePlan = devicePlanKey;
       context.proxyService->startProxy();
       this->contexts.insert({key, context});