Use GpuIpcMem for NVLS connections (#719)

* Now `NvlsConnection` internally reuses `GpuIpcMem` for multicast
memory handling.
* Removed unnecessary barriers from `connectNvlsCollective()` (the CUDA
API handles this synchronization automatically).
* Updated `GpuIpcMem::map()` and `GpuIpcMem::mapMulticast()` to return a
shared pointer with a custom deleter that unmaps the memory, which
prevents misuse of raw pointers and reduces the state that must be stored
in the `GpuIpcMem` instance.
* For `RuntimeIpc`-type handles, `cudaIpcOpenMemHandle` is now called in
`GpuIpcMem::map()` instead of in the `GpuIpcMem` constructor, for
consistency with the other handle types.

---------

Co-authored-by: Binyang Li <binyli@microsoft.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Binyang2014 <9415966+Binyang2014@users.noreply.github.com>
This commit is contained in:
Changho Hwang
2026-01-14 21:16:04 -08:00
committed by GitHub
parent c2a87302bd
commit 105239fc6c
7 changed files with 226 additions and 290 deletions

View File

@@ -14,12 +14,12 @@ class NvlsConnection;
struct SwitchChannel {
private:
void* devicePtr_;
std::shared_ptr<char> mcPtr_;
std::shared_ptr<void> mcPtr_;
size_t bufferSize_;
public:
using DeviceHandle = SwitchChannelDeviceHandle;
SwitchChannel(void* devicePtr, std::shared_ptr<char> mcPtr, size_t bufferSize)
SwitchChannel(void* devicePtr, std::shared_ptr<void> mcPtr, size_t bufferSize)
: devicePtr_(devicePtr), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
DeviceHandle deviceHandle() const;
void* getDevicePtr();
@@ -34,10 +34,6 @@ class NvlsConnection {
NvlsConnection() = delete;
std::vector<char> serialize();
// Everyone needs to synchronize after creating a NVLS connection before adding devices
void addDevice();
void addDevice(int cudaDeviceId);
/// Bind the memory allocated via mscclpp::GpuBuffer to the multicast handle. The behavior
/// is undefined if the devicePtr is not allocated by mscclpp::GpuBuffer.
/// @param devicePtr The device pointer returned by `mscclpp::GpuBuffer::data()`.
@@ -45,8 +41,6 @@ class NvlsConnection {
/// @return SwitchChannel with devicePtr, mcPtr and bufferSize
SwitchChannel bindAllocatedMemory(CUdeviceptr devicePtr, size_t size);
size_t getMultiCastMinGranularity();
private:
class Impl;
std::shared_ptr<Impl> pimpl_;