FIFO improvements (#557)

* Revert `MSCCLPP_FIFO_USE_TAIL_REPLICA=1` back to the default.
* Optimize `FifoDeviceHandle`.
* Do not use `cudaHostAllocWriteCombined`, which increases latency.
* Pin host memory for `Host2DeviceSemaphore::outboundSemaphore_`.
* Fix proxy NUMA binding issues.
* Prevent graph capture inside proxy threads.
* Now `CudaIpcConnection` skips stream sync when unnecessary.
* Now any type of connection needs to hold a shared pointer to the
context for memory safety.
* Now a context should always be managed by a shared pointer for memory
safety.
* Minor docs & interface improvements.
* Minor fix in `mscclpp-test` correctness test.
This commit is contained in:
Changho Hwang
2025-06-24 09:50:28 -07:00
committed by GitHub
parent 2796cfa5ba
commit b4dde38db8
28 changed files with 384 additions and 353 deletions

View File

@@ -148,7 +148,7 @@ void register_core(nb::module_& m) {
.def_rw("ib_max_wr_per_send", &EndpointConfig::ibMaxWrPerSend);
nb::class_<Context>(m, "Context")
.def(nb::init<>())
.def_static("create", &Context::create)
.def(
"register_memory",
[](Communicator* self, uintptr_t ptr, size_t size, TransportFlags transports) {

View File

@@ -16,7 +16,7 @@ void register_port_channel(nb::module_& m) {
.def("stop_proxy", &BaseProxyService::stopProxy);
nb::class_<ProxyService, BaseProxyService>(m, "ProxyService")
.def(nb::init<size_t>(), nb::arg("fifoSize") = DEFAULT_FIFO_SIZE)
.def(nb::init<int>(), nb::arg("fifoSize") = DEFAULT_FIFO_SIZE)
.def("start_proxy", &ProxyService::startProxy)
.def("stop_proxy", &ProxyService::stopProxy)
.def("build_and_add_semaphore", &ProxyService::buildAndAddSemaphore, nb::arg("comm"), nb::arg("connection"))