FIFO improvements (#557)

* Revert `MSCCLPP_FIFO_USE_TAIL_REPLICA=1` back to the default.
* Optimize `FifoDeviceHandle`.
* Do not use `cudaHostAllocWriteCombined`, which increases latency.
* Pin host memory for `Host2DeviceSemaphore::outboundSemaphore_`.
* Fix proxy NUMA binding issues.
* Prevent graph capture inside proxy threads.
* Now `CudaIpcConnection` skips stream sync when unnecessary.
* Now any type of connection needs to hold a shared pointer to the context for memory safety.
* Now a context should always be managed by a shared pointer for memory safety.
* Minor docs & interface improvements.
* Minor fix in `mscclpp-test` correctness test.
2026-06-29 10:57:27 +00:00 · 2025-06-24 09:50:28 -07:00
parent 2796cfa5ba
commit b4dde38db8
28 changed files with 384 additions and 353 deletions
--- a/python/test/_cpp/proxy_test.cpp
+++ b/python/test/_cpp/proxy_test.cpp
@@ -36,18 +36,12 @@ class MyProxyService {
        connections_(conns),
        allRegMem_(allRegMem),
        semaphores_(semaphores),
-        proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) {
+        proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }) {
    int cudaDevice;
    MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
    deviceNumaNode_ = mscclpp::getDeviceNumaNode(cudaDevice);
  }

-  void bindThread() {
-    if (deviceNumaNode_ >= 0) {
-      mscclpp::numaBind(deviceNumaNode_);
-    }
-  }
-
  mscclpp::ProxyHandlerResult handleTrigger(mscclpp::ProxyTrigger) {
    int dataSizePerRank = dataSize_ / nranks_;
    for (int r = 1; r < nranks_; ++r) {
@@ -64,7 +58,7 @@ class MyProxyService {

  void stop() { proxy_.stop(); }

-  mscclpp::FifoDeviceHandle fifoDeviceHandle() { return proxy_.fifo().deviceHandle(); }
+  mscclpp::FifoDeviceHandle fifoDeviceHandle() { return proxy_.fifo()->deviceHandle(); }
 };

 void init_mscclpp_proxy_test_module(nb::module_ &m) {