// Mirror of https://github.com/microsoft/mscclpp.git
// Synced 2026-05-12 09:17:06 +00:00
#include "mscclpp.hpp"

#include "api.h"
#include "utils.h"

#include <atomic>
#include <memory>
#include <thread>
#include <utility>
|
|
|
|
namespace mscclpp {
|
|
|
|
// Number of FIFO polling iterations between checks of the stop flag; keeps the
// atomic load off the hot polling path.
constexpr int ProxyStopCheckPeriod = 1000;

// Flush the FIFO tail to device memory once every this many handled triggers,
// so the device-visible tail makes progress even without explicit flush requests.
constexpr int ProxyFlushPeriod = 4;
|
|
|
|
/// Private implementation of Proxy. Owns the handler callback, the host-side
/// FIFO the device pushes triggers into, the background service thread, and
/// the flag used to request that thread's shutdown.
struct Proxy::Impl {
  ProxyHandler handler;      // invoked once per trigger popped from the FIFO
  HostProxyFifo fifo;        // host-side trigger queue polled by the service thread
  std::thread service;       // background polling thread; started by start(), joined by stop()
  std::atomic_bool running;  // set false by stop() to ask the service loop to exit

  /// Takes the handler by value and moves it into place to avoid copying a
  /// potentially heavy callable twice.
  Impl(ProxyHandler handler) : handler(std::move(handler)), running(false) {}
};
|
|
|
|
/// Constructs a proxy with the given trigger handler. The service thread is
/// not started here; call start() to launch it.
MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler)
    // Initialize pimpl directly (no default-construct-then-assign) and move
    // the handler in rather than copying it.
    : pimpl(std::make_unique<Impl>(std::move(handler))) {}
|
|
|
|
/// Shuts the service thread down (if it is running) before the members are
/// destroyed. A moved-from Proxy holds a null pimpl and has nothing to stop.
MSCCLPP_API_CPP Proxy::~Proxy() {
  if (pimpl) stop();
}
|
|
|
|
/// Launches the background service thread. The thread spins on the host-side
/// FIFO, dispatches each trigger to the user-supplied handler, and exits when
/// either stop() is called or the handler returns ProxyHandlerResult::Stop.
MSCCLPP_API_CPP void Proxy::start() {
  pimpl->running = true;
  pimpl->service = std::thread([this] {
    // from this point on, proxy thread will stay close to the device
    // PROXYMSCCLPPCHECK(numaBind(pimpl->comm->devNumaNode)); // TODO: reenable this

    ProxyHandler handler = this->pimpl->handler;
    HostProxyFifo& fifo = this->pimpl->fifo;
    std::atomic_bool& running = this->pimpl->running;
    ProxyTrigger trigger;

    // Check the stop flag only once every ProxyStopCheckPeriod iterations to
    // keep the atomic load off the hot polling path.
    int runCnt = ProxyStopCheckPeriod;
    uint64_t flushCnt = 0;
    for (;;) {
      if (runCnt-- == 0) {
        runCnt = ProxyStopCheckPeriod;
        if (!running) {
          break;
        }
      }
      // Poll to see if we are ready to send anything
      fifo.poll(&trigger);
      if (trigger.fst == 0) { // TODO: this check is a potential pitfall for custom triggers
        continue; // there is one in progress
      }

      ProxyHandlerResult result = handler(trigger);

      // Send completion: reset only the high 64 bits
      fifo.pop();

      // Flush the tail to device memory either periodically (every
      // ProxyFlushPeriod handled triggers, so the fifo can make progress even
      // when nothing explicitly requests a flush) or immediately when the
      // handler asks for it via FlushFifoTailAndContinue.
      if ((++flushCnt % ProxyFlushPeriod) == 0 || result == ProxyHandlerResult::FlushFifoTailAndContinue) {
        // TODO: relocate this check: || (trigger.fields.type & mscclppSync)
        fifo.flushTail();
      }

      if (result == ProxyHandlerResult::Stop) {
        break;
      }
    }

    // make sure the tail is flushed before we shut the proxy
    fifo.flushTail(/*sync=*/true);
    // TODO: do these need to run?
    // bool isP2pProxy = (proxyState->ibContext == nullptr);
    // if (isP2pProxy) {
    //   cudaStream_t p2pStream = proxyState->p2pStream;
    //   PROXYCUDACHECK(cudaStreamSynchronize(p2pStream));
    // }
  });
}
|
|
|
|
/// Asks the service loop to exit and waits for the thread to finish. Safe to
/// call when the thread was never started (join is skipped if not joinable).
MSCCLPP_API_CPP void Proxy::stop() {
  auto& impl = *pimpl;
  impl.running = false;
  if (impl.service.joinable()) {
    impl.service.join();
  }
}
|
|
|
|
/// Exposes the host-side FIFO owned by this proxy, e.g. so callers can feed
/// triggers to the service thread. The reference stays valid for the lifetime
/// of the Proxy.
MSCCLPP_API_CPP HostProxyFifo& Proxy::fifo() {
  auto& impl = *pimpl;
  return impl.fifo;
}
|
|
|
|
} // namespace mscclpp
|