mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-03-29 03:27:47 +00:00
FIFO improvements (#557)
* Revert `MSCCLPP_FIFO_USE_TAIL_REPLICA=1` back to the default. * Optimize `FifoDeviceHandle`. * Do not use `cudaHostAllocWriteCombined`, which increases latency. * Pin host memory for `Host2DeviceSemaphore::outboundSemaphore_`. * Fix proxy NUMA binding issues. * Prevent graph capture inside proxy threads. * `CudaIpcConnection` now skips stream sync when unnecessary. * Any type of connection now needs to hold a shared pointer to the context for memory safety. * A context should now always be managed by a shared pointer for memory safety. * Minor docs & interface improvements. * Minor fix in `mscclpp-test` correctness test.
This commit is contained in:
@@ -36,18 +36,12 @@ class MyProxyService {
|
||||
connections_(conns),
|
||||
allRegMem_(allRegMem),
|
||||
semaphores_(semaphores),
|
||||
proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) {
|
||||
proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }) {
|
||||
int cudaDevice;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
|
||||
deviceNumaNode_ = mscclpp::getDeviceNumaNode(cudaDevice);
|
||||
}
|
||||
|
||||
void bindThread() {
|
||||
if (deviceNumaNode_ >= 0) {
|
||||
mscclpp::numaBind(deviceNumaNode_);
|
||||
}
|
||||
}
|
||||
|
||||
mscclpp::ProxyHandlerResult handleTrigger(mscclpp::ProxyTrigger) {
|
||||
int dataSizePerRank = dataSize_ / nranks_;
|
||||
for (int r = 1; r < nranks_; ++r) {
|
||||
@@ -64,7 +58,7 @@ class MyProxyService {
|
||||
|
||||
void stop() { proxy_.stop(); }
|
||||
|
||||
mscclpp::FifoDeviceHandle fifoDeviceHandle() { return proxy_.fifo().deviceHandle(); }
|
||||
mscclpp::FifoDeviceHandle fifoDeviceHandle() { return proxy_.fifo()->deviceHandle(); }
|
||||
};
|
||||
|
||||
void init_mscclpp_proxy_test_module(nb::module_ &m) {
|
||||
|
||||
Reference in New Issue
Block a user