mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-06-08 23:38:30 +00:00
Co-authored-by: Pratyush Patel <pratyushpatel.1995@gmail.com> Co-authored-by: Esha Choukse <eschouks@microsoft.com> Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
80 lines
2.7 KiB
C++
80 lines
2.7 KiB
C++
// Copyright (c) Microsoft Corporation.
|
|
// Licensed under the MIT license.
|
|
|
|
#include <cuda/atomic>
|
|
#include <mscclpp/cuda_utils.hpp>
|
|
#include <mscclpp/fifo.hpp>
|
|
|
|
#include "api.h"
|
|
|
|
namespace mscclpp {
|
|
|
|
struct Fifo::Impl {
|
|
UniqueCudaHostPtr<ProxyTrigger[]> triggers;
|
|
UniqueCudaPtr<uint64_t> head;
|
|
UniqueCudaPtr<uint64_t> tailReplica;
|
|
const int size;
|
|
|
|
// allocated on the host. Only accessed by the host. This is a copy of the
|
|
// value pointed to by fifoTailDev and the invariant is that
|
|
// *fifoTailDev <= hostTail. Meaning that host's copy of tail is
|
|
// always ahead of the device's copy and host updates the device's copy
|
|
// only when it is needed. Therefore, hostTail is the "true" tail
|
|
// and fifoTailDev is a "stale" tail. See proxy.cc to undertand how
|
|
// these updates are pushed to the device.
|
|
uint64_t hostTail;
|
|
|
|
// for transferring fifo tail
|
|
CudaStreamWithFlags stream;
|
|
|
|
Impl(int size)
|
|
: triggers(makeUniqueCudaHost<ProxyTrigger[]>(size)),
|
|
head(allocUniqueCuda<uint64_t>()),
|
|
tailReplica(allocUniqueCuda<uint64_t>()),
|
|
size(size),
|
|
hostTail(0),
|
|
stream(cudaStreamNonBlocking) {}
|
|
};
|
|
|
|
MSCCLPP_API_CPP Fifo::Fifo(int size) : pimpl(std::make_unique<Impl>(size)) {}
|
|
MSCCLPP_API_CPP Fifo::~Fifo() = default;
|
|
|
|
MSCCLPP_API_CPP ProxyTrigger Fifo::poll() {
|
|
ProxyTrigger trigger;
|
|
ProxyTrigger* ptr = &pimpl->triggers.get()[pimpl->hostTail % pimpl->size];
|
|
// we are loading fst first. if fst is non-zero then snd is also valid
|
|
trigger.fst = cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{ptr->fst}.load(cuda::memory_order_acquire);
|
|
trigger.snd = ptr->snd;
|
|
return trigger;
|
|
}
|
|
|
|
MSCCLPP_API_CPP void Fifo::pop() {
|
|
cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{pimpl->triggers.get()[pimpl->hostTail % pimpl->size].fst}.store(
|
|
0, cuda::memory_order_release);
|
|
|
|
(pimpl->hostTail)++;
|
|
}
|
|
|
|
MSCCLPP_API_CPP void Fifo::flushTail(bool sync) {
|
|
// Flush the tail to device memory. This is either triggered every ProxyFlushPeriod to make sure that the fifo can
|
|
// make progress even if there is no request mscclppSync. However, mscclppSync type is for flush request.
|
|
MSCCLPP_CUDATHROW(cudaMemcpyAsync(pimpl->tailReplica.get(), &pimpl->hostTail, sizeof(uint64_t),
|
|
cudaMemcpyHostToDevice, pimpl->stream));
|
|
if (sync) {
|
|
MSCCLPP_CUDATHROW(cudaStreamSynchronize(pimpl->stream));
|
|
}
|
|
}
|
|
|
|
MSCCLPP_API_CPP int Fifo::size() const { return pimpl->size; }
|
|
|
|
MSCCLPP_API_CPP FifoDeviceHandle Fifo::deviceHandle() {
|
|
FifoDeviceHandle deviceHandle;
|
|
deviceHandle.triggers = pimpl->triggers.get();
|
|
deviceHandle.head = pimpl->head.get();
|
|
deviceHandle.tailReplica = pimpl->tailReplica.get();
|
|
deviceHandle.size = pimpl->size;
|
|
return deviceHandle;
|
|
}
|
|
|
|
} // namespace mscclpp
|