mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
Reorganize the current native algorithm implementation and the DSL algorithm implementation. Provide a unified API for DSL and native algorithms, and provide an interface to tune the algorithms. Provide an interface for PyTorch integration with the native API and the DSL. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
69 lines
1.9 KiB
C++
69 lines
1.9 KiB
C++
// Copyright (c) Microsoft Corporation.
|
|
// Licensed under the MIT license.
|
|
|
|
#include <mscclpp/env.hpp>
|
|
#include <mscclpp/fifo.hpp>
|
|
#include <mscclpp/gpu_utils.hpp>
|
|
#include <mscclpp/numa.hpp>
|
|
|
|
#include "api.h"
|
|
#include "atomic.hpp"
|
|
|
|
namespace mscclpp {
|
|
|
|
/// Private state of Fifo (pimpl): the trigger ring plus its head/tail counters.
///
/// Members are declared in the order in which the constructor initializes
/// them; keep both lists in sync when adding fields.
struct Fifo::Impl {
  /// Ring of `size` trigger slots, obtained from gpuCallocHostUnique.
  /// Read by poll() and cleared slot-by-slot by pop().
  detail::UniqueGpuHostPtr<ProxyTrigger> triggers;
  /// Head counter, obtained from gpuCallocUnique. Host code in this file never
  /// reads or writes it; it is only exported through deviceHandle().
  detail::UniqueGpuPtr<uint64_t> head;
  /// Tail counter, obtained from gpuCallocHostUnique. Read by poll() and
  /// advanced by pop().
  detail::UniqueGpuHostPtr<uint64_t> tail;
  /// Obtained from gpuCallocUnique and only exported through deviceHandle().
  /// NOTE(review): presumably a device-side cached copy of `tail` — confirm.
  detail::UniqueGpuPtr<uint64_t> tailCache;
  /// Number of slots in `triggers`; fixed for the lifetime of the FIFO.
  const int size;

  /// Allocates the trigger ring and the three counters.
  ///
  /// Marked `explicit` so that an `int` can never be silently converted into
  /// an Impl.
  ///
  /// @param size Number of trigger slots to allocate.
  explicit Impl(int size)
      : triggers(detail::gpuCallocHostUnique<ProxyTrigger>(size)),
        head(detail::gpuCallocUnique<uint64_t>()),
        tail(detail::gpuCallocHostUnique<uint64_t>()),
        tailCache(detail::gpuCallocUnique<uint64_t>()),
        size(size) {}
};
|
|
|
|
/// Creates a FIFO with `size` trigger slots.
///
/// Looks up the NUMA node of the current CUDA device and, when one is reported
/// (non-negative), calls numaBind() on it before the buffers are allocated.
/// NOTE(review): presumably so that the host-side allocations land close to
/// the GPU — confirm against numaBind()'s contract.
///
/// @param size Number of trigger slots.
MSCCLPP_API_CPP Fifo::Fifo(int size) {
  int deviceId;
  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));

  const int node = getDeviceNumaNode(deviceId);
  if (node >= 0) numaBind(node);

  // Allocate only after the (optional) NUMA binding above.
  pimpl_ = std::make_unique<Impl>(size);
}
|
|
|
|
// Defaulted here, in the file that defines Fifo::Impl.
// NOTE(review): presumably kept out of the header because pimpl_ owns an Impl
// that is only forward-declared there — confirm against fifo.hpp.
MSCCLPP_API_CPP Fifo::~Fifo() = default;
|
|
|
|
/// Reads the trigger stored in the slot the tail currently points at.
///
/// Nothing is consumed: neither the tail nor the slot is modified. Call pop()
/// to advance past the slot.
///
/// @return A copy of the slot's `fst` and `snd` words.
MSCCLPP_API_CPP ProxyTrigger Fifo::poll() {
  const uint64_t slot = *(pimpl_->tail) % pimpl_->size;
  ProxyTrigger* entry = pimpl_->triggers.get() + slot;

  ProxyTrigger result;
  // Order matters: fst is acquire-loaded first. A non-zero fst means snd is
  // valid as well, so snd may be read with a plain load afterwards.
  result.fst = atomicLoad(&(entry->fst), memoryOrderAcquire);
  result.snd = entry->snd;
  return result;
}
|
|
|
|
/// Releases the slot at the current tail and advances the tail by one.
MSCCLPP_API_CPP void Fifo::pop() {
  const uint64_t consumed = *(pimpl_->tail);
  ProxyTrigger* slot = pimpl_->triggers.get() + (consumed % pimpl_->size);

  // Zero fst before publishing the new tail; the release store below orders
  // this write ahead of the tail update.
  slot->fst = 0;
  atomicStore(pimpl_->tail.get(), consumed + 1, memoryOrderRelease);
}
|
|
|
|
/// @return The number of trigger slots this FIFO was constructed with.
MSCCLPP_API_CPP int Fifo::size() const {
  return pimpl_->size;
}
|
|
|
|
/// Bundles raw pointers to the FIFO's buffers, together with its size, into a
/// FifoDeviceHandle.
///
/// The handle only borrows the pointers; the memory stays owned by this Fifo.
/// NOTE(review): the pointers presumably become invalid once the Fifo is
/// destroyed — confirm against the deleters of the Unique*Ptr types.
///
/// @return A handle referring to this FIFO's triggers, head, tail, tailCache
///         and size.
MSCCLPP_API_CPP FifoDeviceHandle Fifo::deviceHandle() const {
  const Impl& impl = *pimpl_;

  FifoDeviceHandle handle;
  handle.triggers = impl.triggers.get();
  handle.head = impl.head.get();
  handle.tail = impl.tail.get();
  handle.tailCache = impl.tailCache.get();
  handle.size = impl.size;
  return handle;
}
|
|
|
|
} // namespace mscclpp
|