FIFO improvements (#557)

* Revert `MSCCLPP_FIFO_USE_TAIL_REPLICA=1` back to the default.
* Optimize `FifoDeviceHandle`.
* Do not use `cudaHostAllocWriteCombined`, which increases latency.
* Pin host memory for `Host2DeviceSemaphore::outboundSemaphore_`.
* Fix proxy NUMA binding issues.
* Prevent graph capture inside proxy threads.
* Now `CudaIpcConnection` skips stream sync when unnecessary.
* Now any type of connection needs to hold a shared pointer to the
context for memory safety.
* Now a context should always be managed by a shared pointer for memory
safety.
* Minor docs & interface improvements.
* Minor fix in `mscclpp-test` correctness test.
This commit is contained in:
Changho Hwang
2025-06-24 09:50:28 -07:00
committed by GitHub
parent 2796cfa5ba
commit b4dde38db8
28 changed files with 384 additions and 353 deletions

View File

@@ -36,18 +36,12 @@ class MyProxyService {
connections_(conns),
allRegMem_(allRegMem),
semaphores_(semaphores),
proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) {
proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }) {
int cudaDevice;
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
deviceNumaNode_ = mscclpp::getDeviceNumaNode(cudaDevice);
}
void bindThread() {
if (deviceNumaNode_ >= 0) {
mscclpp::numaBind(deviceNumaNode_);
}
}
mscclpp::ProxyHandlerResult handleTrigger(mscclpp::ProxyTrigger) {
int dataSizePerRank = dataSize_ / nranks_;
for (int r = 1; r < nranks_; ++r) {
@@ -64,7 +58,7 @@ class MyProxyService {
void stop() { proxy_.stop(); }
mscclpp::FifoDeviceHandle fifoDeviceHandle() { return proxy_.fifo().deviceHandle(); }
mscclpp::FifoDeviceHandle fifoDeviceHandle() { return proxy_.fifo()->deviceHandle(); }
};
void init_mscclpp_proxy_test_module(nb::module_ &m) {