Mirror of https://github.com/microsoft/mscclpp.git (synced 2026-05-04 21:51:32 +00:00)
Add GpuBuffer class (#423)
* Renamed and moved mem alloc functions into the `mscclpp::detail::` namespace (now `mscclpp::detail::gpuCalloc*<T>()`)
* Deprecated constructor-calling mem alloc functions (`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`)
* Added a new `mscclpp::GpuBuffer<T>()` class that should be used in general for allocating communication buffers
* Added a new `mscclpp.utils.GpuBuffer` Python class that inherits `cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc`
* Renamed `mscclpp::memcpyCuda*<T>()` functions into `mscclpp::gpuMemcpy*<T>()` for name consistency
* A few fixes in NVLS memory allocation
* Tackled minor compiler warnings
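The executor diff below applies these APIs directly. As a standalone illustration, here is a minimal sketch of the new allocate-and-copy pattern; the header path `mscclpp/gpu_utils.hpp` is an assumption, while the `GpuBuffer(...).memory()` and `gpuMemcpy(...)` shapes are taken from the diff itself:

```cpp
#include <mscclpp/gpu_utils.hpp>  // assumed header exposing GpuBuffer and gpuMemcpy

#include <cuda_runtime.h>  // cudaMemcpyHostToDevice

#include <memory>
#include <vector>

int main() {
  std::vector<char> hostData(4096, 0);

  // GpuBuffer supersedes the deprecated makeShared*/makeUnique* helpers; the
  // element type defaults to char, matching the byte-count usage in the diff.
  mscclpp::GpuBuffer<char> buf(hostData.size());
  std::shared_ptr<char> devMem = buf.memory();

  // gpuMemcpy is the renamed memcpyCuda*; it keeps the cudaMemcpyKind argument.
  mscclpp::gpuMemcpy(devMem.get(), hostData.data(), hostData.size(), cudaMemcpyHostToDevice);
  return 0;
}
```

Holding the `std::shared_ptr` returned by `memory()` keeps the allocation alive after the `GpuBuffer` object itself goes out of scope, which is exactly how the executor stores its plan buffers below.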
```diff
@@ -155,10 +155,10 @@ struct Executor::Impl {
       plan.impl_->lightLoadExecutionPlan(inputMessageSize, outputMessageSize, constSrcOffset, constDstOffset);
       this->setupDeviceExecutionPlan(this->contexts[key], devicePlanKey, rank, plan);
       this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey] =
-          allocExtSharedCuda<char>(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
-      memcpyCuda(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(),
-                 (char*)devicePlans[devicePlanKey].data(),
-                 devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
+          GpuBuffer(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory();
+      gpuMemcpy(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(),
+                (char*)devicePlans[devicePlanKey].data(),
+                devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
       this->contexts[key].currentDevicePlan = devicePlanKey;
       return this->contexts[key];
     }
```
```diff
@@ -170,12 +170,7 @@ struct Executor::Impl {
     size_t maxScratchBufferSize = plan.impl_->getMaxScratchBufferSize(rank);
     size_t scratchBufferSize =
         std::min(plan.impl_->getScratchBufferSize(rank, sendMemRange, recvMemRange), maxScratchBufferSize);
-    std::shared_ptr<char> scratchBuffer;
-    if (isNvlsSupported()) {
-      scratchBuffer = allocSharedPhysicalCuda<char>(scratchBufferSize);
-    } else {
-      scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
-    }
+    std::shared_ptr<char> scratchBuffer = GpuBuffer(scratchBufferSize).memory();
     context.scratchBuffer = scratchBuffer;
     context.scratchBufferSize = scratchBufferSize;
     context.proxyService = std::make_shared<ProxyService>();
```
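Note what this hunk deletes: the per-call-site `isNvlsSupported()` branch. The choice between physical (NVLS-capable) memory and regular device memory now presumably happens once inside `GpuBuffer`. A minimal sketch of that idea, reusing the helper names visible in the removed lines; the member layout and any behavior beyond the branch are assumptions, not the library's actual implementation:

```cpp
#include <cstddef>
#include <memory>

// Helpers as named in the removed branch above; declarations only for the sketch.
bool isNvlsSupported();
template <class T> std::shared_ptr<T> allocSharedPhysicalCuda(size_t nelems);
template <class T> std::shared_ptr<T> allocExtSharedCuda(size_t nelems);

// Sketch only: one plausible way GpuBuffer internalizes the NVLS decision that
// each call site previously made by hand.
template <class T = char>
class GpuBuffer {
 public:
  explicit GpuBuffer(size_t nelems)
      : memory_(isNvlsSupported() ? allocSharedPhysicalCuda<T>(nelems)  // multicast-capable
                                  : allocExtSharedCuda<T>(nelems)) {}   // plain device memory
  std::shared_ptr<T> memory() { return memory_; }

 private:
  std::shared_ptr<T> memory_;
};
```

Centralizing the branch is what lets the hunk collapse six lines into the single `GpuBuffer(scratchBufferSize).memory()` call.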
```diff
@@ -186,11 +181,10 @@ struct Executor::Impl {
     this->setupNvlsChannels(context, sendbuff, recvbuff, sendMemRange, recvMemRange, rank, plan);
     this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan);
     context.deviceExecutionPlansBuffers[devicePlanKey] =
-        allocExtSharedCuda<char>(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
-    memcpyCuda(context.deviceExecutionPlansBuffers[devicePlanKey].get(),
-               (char*)context.deviceExecutionPlans[devicePlanKey].data(),
-               context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan),
-               cudaMemcpyHostToDevice);
+        GpuBuffer(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory();
+    gpuMemcpy(context.deviceExecutionPlansBuffers[devicePlanKey].get(),
+              (char*)context.deviceExecutionPlans[devicePlanKey].data(),
+              context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
     context.currentDevicePlan = devicePlanKey;
     context.proxyService->startProxy();
     this->contexts.insert({key, context});
```