Add GpuBuffer class (#423)

* Renamed and moved the memory allocation functions into the `mscclpp::detail::`
namespace (now `mscclpp::detail::gpuCalloc*<T>()`)
* Deprecated the constructor-calling memory allocation functions
(`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`)
* Added a new `mscclpp::GpuBuffer<T>()` class that should generally be used
for allocating communication buffers (see the usage sketch after this list)
* Added a new `mscclpp.utils.GpuBuffer` Python class that inherits from
`cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc`
* Renamed the `mscclpp::memcpyCuda*<T>()` functions to
`mscclpp::gpuMemcpy*<T>()` for naming consistency
* A few fixes in NVLS memory allocation
* Fixed minor compiler warnings
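
For illustration, here is a minimal usage sketch of the new C++ API. It assumes the helpers live in `<mscclpp/gpu_utils.hpp>`, that `GpuBuffer<T>` is constructed from an element count, and that `memory()` returns a `std::shared_ptr<T>` that keeps the allocation alive, as the diff hunks below suggest; treat it as a sketch, not the definitive interface.

```cpp
#include <mscclpp/gpu_utils.hpp>

#include <cuda_runtime.h>

#include <memory>
#include <vector>

int main() {
  constexpr size_t kBytes = 1 << 20;

  // Allocate a 1 MiB communication buffer. GpuBuffer is expected to pick the
  // allocator internally (physical memory when NVLS is supported, the
  // extended allocator otherwise), replacing the old explicit branch.
  mscclpp::GpuBuffer<char> buf(kBytes);
  std::shared_ptr<char> mem = buf.memory();

  // Stage host data into the buffer with the renamed copy helper
  // (formerly memcpyCuda).
  std::vector<char> host(kBytes, 0);
  mscclpp::gpuMemcpy(mem.get(), host.data(), host.size(), cudaMemcpyHostToDevice);
  return 0;
}
```

Because `GpuBuffer` owns the allocator choice, callers no longer need to branch on NVLS support themselves, as the scratch-buffer hunk below shows.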
Author: Changho Hwang
Date: 2025-01-07 18:40:01 -08:00
Committed by: GitHub
Parent: 6d26b92665
Commit: 34945fb107
38 changed files with 527 additions and 555 deletions


@@ -155,10 +155,10 @@ struct Executor::Impl {
       plan.impl_->lightLoadExecutionPlan(inputMessageSize, outputMessageSize, constSrcOffset, constDstOffset);
       this->setupDeviceExecutionPlan(this->contexts[key], devicePlanKey, rank, plan);
       this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey] =
-          allocExtSharedCuda<char>(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
-      memcpyCuda(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(),
-                 (char*)devicePlans[devicePlanKey].data(),
-                 devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
+          GpuBuffer(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory();
+      gpuMemcpy(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(),
+                (char*)devicePlans[devicePlanKey].data(),
+                devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
       this->contexts[key].currentDevicePlan = devicePlanKey;
       return this->contexts[key];
     }
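
Note that the call sites keep their byte-based size expressions: `GpuBuffer` is constructed here without an explicit template argument, which suggests the element type defaults to `char` (the next hunk assigns its `memory()` result to a `std::shared_ptr<char>`).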
@@ -170,12 +170,7 @@ struct Executor::Impl {
       size_t maxScratchBufferSize = plan.impl_->getMaxScratchBufferSize(rank);
       size_t scratchBufferSize =
           std::min(plan.impl_->getScratchBufferSize(rank, sendMemRange, recvMemRange), maxScratchBufferSize);
-      std::shared_ptr<char> scratchBuffer;
-      if (isNvlsSupported()) {
-        scratchBuffer = allocSharedPhysicalCuda<char>(scratchBufferSize);
-      } else {
-        scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
-      }
+      std::shared_ptr<char> scratchBuffer = GpuBuffer(scratchBufferSize).memory();
       context.scratchBuffer = scratchBuffer;
       context.scratchBufferSize = scratchBufferSize;
       context.proxyService = std::make_shared<ProxyService>();
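
The `isNvlsSupported()` branch removed above is presumably what the `GpuBuffer` constructor now encapsulates. Below is a hedged sketch of the equivalent selection logic, using the allocators named in the removed lines; `allocScratch` is a hypothetical helper, and the actual internal implementation may differ.

```cpp
#include <mscclpp/gpu_utils.hpp>

#include <memory>

// Hypothetical helper sketching the choice GpuBuffer likely makes internally,
// reconstructed from the branch removed above.
std::shared_ptr<char> allocScratch(size_t bytes) {
  if (mscclpp::isNvlsSupported()) {
    // Physical allocation, so the buffer can later back an NVLS binding.
    return mscclpp::allocSharedPhysicalCuda<char>(bytes);
  }
  // Extended shared allocation on systems without NVLS support.
  return mscclpp::allocExtSharedCuda<char>(bytes);
}
```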
@@ -186,11 +181,10 @@ struct Executor::Impl {
       this->setupNvlsChannels(context, sendbuff, recvbuff, sendMemRange, recvMemRange, rank, plan);
       this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan);
       context.deviceExecutionPlansBuffers[devicePlanKey] =
-          allocExtSharedCuda<char>(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
-      memcpyCuda(context.deviceExecutionPlansBuffers[devicePlanKey].get(),
-                 (char*)context.deviceExecutionPlans[devicePlanKey].data(),
-                 context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan),
-                 cudaMemcpyHostToDevice);
+          GpuBuffer(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory();
+      gpuMemcpy(context.deviceExecutionPlansBuffers[devicePlanKey].get(),
+                (char*)context.deviceExecutionPlans[devicePlanKey].data(),
+                context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
       context.currentDevicePlan = devicePlanKey;
       context.proxyService->startProxy();
       this->contexts.insert({key, context});