Use GpuIpcMem for NVLS connections (#719)

* `NvlsConnection` now internally reuses `GpuIpcMem` for multicast memory handling.
* Removed unnecessary barriers from `connectNvlsCollective()` (the CUDA API handles this automatically).
* Updated `GpuIpcMem::map()` and `GpuIpcMem::mapMulticast()` to return a shared pointer with a custom deleter for unmapping, which prevents misuse of raw pointers and reduces the state that must be stored in the `GpuIpcMem` instance.
* For `RuntimeIpc`-type handles, `cudaIpcOpenMemHandle` is now called in `GpuIpcMem::map()` instead of in the `GpuIpcMem` constructor, for consistency with the other handle types.

---------

Co-authored-by: Binyang Li <binyli@microsoft.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Binyang2014 <9415966+Binyang2014@users.noreply.github.com>
2026-04-19 22:39:11 +00:00 · 2026-01-14 21:16:04 -08:00
parent c2a87302bd
commit 105239fc6c
7 changed files with 226 additions and 290 deletions
--- a/python/csrc/switch_channel_py.cpp
+++ b/python/csrc/switch_channel_py.cpp
@@ -29,8 +29,7 @@ void register_nvls(nb::module_& m) {
      });

  nb::class_<NvlsConnection>(m, "NvlsConnection")
-      .def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("device_ptr"), nb::arg("size"))
-      .def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity);
+      .def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("device_ptr"), nb::arg("size"));

  m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("all_ranks"),
        nb::arg("buffer_size"));