mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
Cuda smart pointers
This commit is contained in:
@@ -5,25 +5,9 @@
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <mscclpp/errors.hpp>
|
||||
#include <string>
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#define MSCCLPPTHROW(call) \
|
||||
do { \
|
||||
mscclppResult_t res = call; \
|
||||
mscclpp::ErrorCode err = mscclpp::ErrorCode::InternalError; \
|
||||
if (res != mscclppSuccess && res != mscclppInProgress) { \
|
||||
if (res == mscclppInvalidUsage) { \
|
||||
err = mscclpp::ErrorCode::InvalidUsage; \
|
||||
} else if (res == mscclppSystemError) { \
|
||||
err = mscclpp::ErrorCode::SystemError; \
|
||||
} \
|
||||
throw mscclpp::Error(std::string("Call to " #call " failed. ") + __FILE__ + ":" + std::to_string(__LINE__), \
|
||||
err); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#define CUDATHROW(cmd) \
|
||||
#define MSCCLPP_CUDATHROW(cmd) \
|
||||
do { \
|
||||
cudaError_t err = cmd; \
|
||||
if (err != cudaSuccess) { \
|
||||
@@ -32,7 +16,7 @@
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#define CUTHROW(cmd) \
|
||||
#define MSCCLPP_CUTHROW(cmd) \
|
||||
do { \
|
||||
CUresult err = cmd; \
|
||||
if (err != CUDA_SUCCESS) { \
|
||||
109
include/mscclpp/cuda_utils.hpp
Normal file
109
include/mscclpp/cuda_utils.hpp
Normal file
@@ -0,0 +1,109 @@
|
||||
#ifndef MSCCLPP_CUDA_UTILS_HPP_
|
||||
#define MSCCLPP_CUDA_UTILS_HPP_
|
||||
|
||||
// #include <type_traits>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <mscclpp/checks.hpp>
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
struct AvoidCudaGraphCaptureGuard {
|
||||
AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) {
|
||||
MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_));
|
||||
}
|
||||
~AvoidCudaGraphCaptureGuard() { cudaThreadExchangeStreamCaptureMode(&mode_); }
|
||||
cudaStreamCaptureMode mode_;
|
||||
};
|
||||
|
||||
struct CudaStreamWithFlags {
|
||||
CudaStreamWithFlags(unsigned int flags) { MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags)); }
|
||||
~CudaStreamWithFlags() { cudaStreamDestroy(stream_); }
|
||||
operator cudaStream_t() const { return stream_; }
|
||||
cudaStream_t stream_;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <class T>
|
||||
T* cudaCalloc(size_t nelem) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
T* ptr;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&ptr, nelem * sizeof(T)));
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, nelem * sizeof(T), stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
T* cudaHostCalloc(size_t nelem) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
T* ptr;
|
||||
MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, nelem * sizeof(T), cudaHostAllocMapped | cudaHostAllocWriteCombined));
|
||||
memset(ptr, 0, nelem * sizeof(T));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
template <class T, T*(alloc)(size_t), class Deleter, class Memory>
|
||||
Memory safeMake(size_t nelem) {
|
||||
T* ptr = nullptr;
|
||||
try {
|
||||
ptr = alloc(nelem);
|
||||
} catch (...) {
|
||||
if (ptr) {
|
||||
Deleter()(ptr);
|
||||
}
|
||||
}
|
||||
return Memory(ptr, Deleter());
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <class T>
|
||||
struct CudaDeleter {
|
||||
void operator()(T* ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaFree(ptr));
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct CudaHostDeleter {
|
||||
void operator()(T* ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
std::shared_ptr<T> makeSharedCuda(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
using UniqueCudaPtr = std::unique_ptr<T, CudaDeleter<T>>;
|
||||
|
||||
template <class T>
|
||||
UniqueCudaPtr<T> makeUniqueCuda(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
std::shared_ptr<T> makeSharedCudaHost(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(count);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
using UniqueCudaHostPtr = std::unique_ptr<T, CudaHostDeleter<T>>;
|
||||
|
||||
template <class T>
|
||||
UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // MSCCLPP_CUDA_UTILS_HPP_
|
||||
@@ -1,7 +1,9 @@
|
||||
#ifndef MSCCLPP_EPOCH_HPP_
|
||||
#define MSCCLPP_EPOCH_HPP_
|
||||
|
||||
#include <memory>
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/cuda_utils.hpp>
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
@@ -10,6 +12,7 @@ struct alignas(16) EpochIds {
|
||||
uint64_t inboundReplica;
|
||||
};
|
||||
|
||||
template <template <typename> typename Deleter>
|
||||
class BaseEpoch {
|
||||
private:
|
||||
std::shared_ptr<Connection> connection_;
|
||||
@@ -17,21 +20,31 @@ class BaseEpoch {
|
||||
NonblockingFuture<RegisteredMemory> remoteEpochIdsRegMem_;
|
||||
|
||||
protected:
|
||||
EpochIds* epochIds_;
|
||||
uint64_t* expectedInboundEpochId_;
|
||||
std::unique_ptr<EpochIds, Deleter<EpochIds>> epochIds_;
|
||||
std::unique_ptr<uint64_t, Deleter<uint64_t>> expectedInboundEpochId_;
|
||||
|
||||
public:
|
||||
BaseEpoch(std::shared_ptr<Connection> connection);
|
||||
void setup(Communicator& communicator);
|
||||
BaseEpoch(const BaseEpoch&) = delete;
|
||||
void signal();
|
||||
BaseEpoch(std::shared_ptr<Connection> connection, std::unique_ptr<EpochIds, Deleter<EpochIds>> epochIds,
|
||||
std::unique_ptr<uint64_t, Deleter<uint64_t>> expectedInboundEpochId)
|
||||
: connection_(connection),
|
||||
epochIds_(std::move(epochIds)),
|
||||
expectedInboundEpochId_(std::move(expectedInboundEpochId)) {}
|
||||
|
||||
void setup(Communicator& communicator) {
|
||||
localEpochIdsRegMem_ = communicator.registerMemory(epochIds_.get(), sizeof(epochIds_), connection_->transport());
|
||||
communicator.sendMemoryOnSetup(localEpochIdsRegMem_, connection_->remoteRank(), connection_->tag());
|
||||
remoteEpochIdsRegMem_ = communicator.recvMemoryOnSetup(connection_->remoteRank(), connection_->tag());
|
||||
}
|
||||
|
||||
void signal() {
|
||||
connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica), localEpochIdsRegMem_,
|
||||
offsetof(EpochIds, outbound), sizeof(epochIds_));
|
||||
}
|
||||
};
|
||||
|
||||
class DeviceEpoch : BaseEpoch {
|
||||
class DeviceEpoch : BaseEpoch<CudaDeleter> {
|
||||
public:
|
||||
DeviceEpoch(Communicator& communicator, std::shared_ptr<Connection> connection);
|
||||
DeviceEpoch(const DeviceEpoch&) = delete;
|
||||
~DeviceEpoch();
|
||||
void signal();
|
||||
|
||||
struct DeviceHandle {
|
||||
@@ -52,13 +65,11 @@ class DeviceEpoch : BaseEpoch {
|
||||
DeviceHandle deviceHandle();
|
||||
};
|
||||
|
||||
class HostEpoch : BaseEpoch {
|
||||
class HostEpoch : BaseEpoch<std::default_delete> {
|
||||
public:
|
||||
HostEpoch(Communicator& communicator, std::shared_ptr<Connection> connection);
|
||||
HostEpoch(const HostEpoch&) = delete;
|
||||
~HostEpoch();
|
||||
|
||||
void increamentAndSignal();
|
||||
void incrementAndSignal();
|
||||
void wait();
|
||||
};
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "api.h"
|
||||
#include "checks.hpp"
|
||||
#include "checks_internal.hpp"
|
||||
#include "socket.h"
|
||||
#include "utils.h"
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include <mscclpp/channel.hpp>
|
||||
|
||||
#include "api.h"
|
||||
#include "checks.hpp"
|
||||
#include "checks_internal.hpp"
|
||||
#include "debug.h"
|
||||
#include "utils.h"
|
||||
|
||||
@@ -12,7 +12,7 @@ MSCCLPP_API_CPP DeviceChannelService::DeviceChannelService(Communicator& communi
|
||||
: communicator_(communicator),
|
||||
proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) {
|
||||
int cudaDevice;
|
||||
CUDATHROW(cudaGetDevice(&cudaDevice));
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
|
||||
MSCCLPPTHROW(getDeviceNumaNode(cudaDevice, &deviceNumaNode));
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#include <sstream>
|
||||
|
||||
#include "api.h"
|
||||
#include "checks.hpp"
|
||||
#include "checks_internal.hpp"
|
||||
#include "connection.hpp"
|
||||
#include "debug.h"
|
||||
#include "registered_memory.hpp"
|
||||
@@ -19,7 +19,7 @@ Communicator::Impl::Impl(std::shared_ptr<BaseBootstrap> bootstrap) : bootstrap_(
|
||||
rankToHash_[bootstrap->getRank()] = hostHash;
|
||||
bootstrap->allGather(rankToHash_.data(), sizeof(uint64_t));
|
||||
|
||||
CUDATHROW(cudaStreamCreateWithFlags(&ipcStream_, cudaStreamNonBlocking));
|
||||
MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&ipcStream_, cudaStreamNonBlocking));
|
||||
}
|
||||
|
||||
Communicator::Impl::~Impl() {
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "checks.hpp"
|
||||
#include "checks_internal.hpp"
|
||||
#include "debug.h"
|
||||
#include "infiniband/verbs.h"
|
||||
#include "npkit/npkit.h"
|
||||
#include "registered_memory.hpp"
|
||||
@@ -47,14 +48,14 @@ void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, Register
|
||||
char* dstPtr = (char*)dst.data();
|
||||
char* srcPtr = (char*)src.data();
|
||||
|
||||
CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, stream_));
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, stream_));
|
||||
INFO(MSCCLPP_P2P, "CudaIpcConnection write: from %p to %p, size %lu", srcPtr + srcOffset, dstPtr + dstOffset, size);
|
||||
|
||||
// npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)size);
|
||||
}
|
||||
|
||||
void CudaIpcConnection::flush() {
|
||||
CUDATHROW(cudaStreamSynchronize(stream_));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream_));
|
||||
// npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT);
|
||||
}
|
||||
|
||||
|
||||
39
src/epoch.cc
39
src/epoch.cc
@@ -1,61 +1,32 @@
|
||||
#include <mscclpp/epoch.hpp>
|
||||
|
||||
#include "alloc.h"
|
||||
#include "api.h"
|
||||
#include "checks.hpp"
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
BaseEpoch::BaseEpoch(std::shared_ptr<Connection> connection) : connection_(connection) {}
|
||||
|
||||
void BaseEpoch::setup(Communicator& communicator) {
|
||||
localEpochIdsRegMem_ = communicator.registerMemory(epochIds_, sizeof(epochIds_), connection_->transport());
|
||||
communicator.sendMemoryOnSetup(localEpochIdsRegMem_, connection_->remoteRank(), connection_->tag());
|
||||
remoteEpochIdsRegMem_ = communicator.recvMemoryOnSetup(connection_->remoteRank(), connection_->tag());
|
||||
}
|
||||
|
||||
void BaseEpoch::signal() {
|
||||
connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica), localEpochIdsRegMem_,
|
||||
offsetof(EpochIds, outbound), sizeof(epochIds_));
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP DeviceEpoch::DeviceEpoch(Communicator& communicator, std::shared_ptr<Connection> connection)
|
||||
: BaseEpoch(connection) {
|
||||
MSCCLPPTHROW(mscclppCudaCalloc(&epochIds_, 1));
|
||||
MSCCLPPTHROW(mscclppCudaCalloc(&expectedInboundEpochId_, 1));
|
||||
: BaseEpoch(connection, makeUniqueCuda<EpochIds>(), makeUniqueCuda<uint64_t>()) {
|
||||
setup(communicator);
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP DeviceEpoch::~DeviceEpoch() {
|
||||
mscclppCudaFree(epochIds_);
|
||||
mscclppCudaFree(expectedInboundEpochId_);
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP void DeviceEpoch::signal() { BaseEpoch::signal(); }
|
||||
|
||||
MSCCLPP_API_CPP DeviceEpoch::DeviceHandle DeviceEpoch::deviceHandle() {
|
||||
DeviceEpoch::DeviceHandle device;
|
||||
device.epochIds = epochIds_;
|
||||
device.expectedInboundEpochId = expectedInboundEpochId_;
|
||||
device.epochIds = epochIds_.get();
|
||||
device.expectedInboundEpochId = expectedInboundEpochId_.get();
|
||||
return device;
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP HostEpoch::HostEpoch(Communicator& communicator, std::shared_ptr<Connection> connection)
|
||||
: BaseEpoch(connection) {
|
||||
: BaseEpoch(connection, std::make_unique<EpochIds>(), std::make_unique<uint64_t>()) {
|
||||
if (connection->transport() == Transport::CudaIpc) {
|
||||
throw Error("HostEpoch cannot be used with CudaIpc transport", ErrorCode::InvalidUsage);
|
||||
}
|
||||
epochIds_ = new EpochIds();
|
||||
expectedInboundEpochId_ = new uint64_t();
|
||||
setup(communicator);
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP HostEpoch::~HostEpoch() {
|
||||
delete epochIds_;
|
||||
delete expectedInboundEpochId_;
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP void HostEpoch::increamentAndSignal() {
|
||||
MSCCLPP_API_CPP void HostEpoch::incrementAndSignal() {
|
||||
*(volatile uint64_t*)&(epochIds_->outbound) += 1;
|
||||
signal();
|
||||
}
|
||||
|
||||
52
src/fifo.cc
52
src/fifo.cc
@@ -1,17 +1,19 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
#include <mscclpp/cuda_utils.hpp>
|
||||
#include <mscclpp/fifo.hpp>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "alloc.h"
|
||||
#include "api.h"
|
||||
#include "checks.hpp"
|
||||
#include "checks_internal.hpp"
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
struct HostProxyFifo::Impl {
|
||||
DeviceProxyFifo deviceFifo;
|
||||
UniqueCudaHostPtr<ProxyTrigger> triggers;
|
||||
UniqueCudaPtr<uint64_t> head;
|
||||
UniqueCudaPtr<uint64_t> tailReplica;
|
||||
|
||||
// allocated on the host. Only accessed by the host. This is a copy of the
|
||||
// value pointed to by fifoTailDev and the invariant is that
|
||||
@@ -23,32 +25,26 @@ struct HostProxyFifo::Impl {
|
||||
uint64_t hostTail;
|
||||
|
||||
// for transferring fifo tail
|
||||
cudaStream_t stream;
|
||||
CudaStreamWithFlags stream;
|
||||
|
||||
Impl()
|
||||
: triggers(makeUniqueCudaHost<ProxyTrigger>(MSCCLPP_PROXY_FIFO_SIZE)),
|
||||
head(makeUniqueCuda<uint64_t>(1)),
|
||||
tailReplica(makeUniqueCuda<uint64_t>(1)),
|
||||
hostTail(0),
|
||||
stream(cudaStreamNonBlocking) {}
|
||||
};
|
||||
|
||||
MSCCLPP_API_CPP HostProxyFifo::HostProxyFifo() {
|
||||
pimpl = std::make_unique<Impl>();
|
||||
MSCCLPPTHROW(mscclppCudaCalloc(&pimpl->deviceFifo.head, 1));
|
||||
MSCCLPPTHROW(mscclppCudaHostCalloc(&pimpl->deviceFifo.triggers, MSCCLPP_PROXY_FIFO_SIZE));
|
||||
MSCCLPPTHROW(mscclppCudaCalloc(&pimpl->deviceFifo.tailReplica, 1));
|
||||
CUDATHROW(cudaStreamCreateWithFlags(&pimpl->stream, cudaStreamNonBlocking));
|
||||
pimpl->hostTail = 0;
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP HostProxyFifo::~HostProxyFifo() {
|
||||
mscclppCudaFree(pimpl->deviceFifo.head);
|
||||
mscclppCudaHostFree(pimpl->deviceFifo.triggers);
|
||||
mscclppCudaFree(pimpl->deviceFifo.tailReplica);
|
||||
cudaStreamDestroy(pimpl->stream);
|
||||
}
|
||||
MSCCLPP_API_CPP HostProxyFifo::HostProxyFifo() : pimpl(std::make_unique<Impl>()) {}
|
||||
MSCCLPP_API_CPP HostProxyFifo::~HostProxyFifo() = default;
|
||||
|
||||
MSCCLPP_API_CPP void HostProxyFifo::poll(ProxyTrigger* trigger) {
|
||||
__m128i xmm0 = _mm_load_si128((__m128i*)&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]);
|
||||
__m128i xmm0 = _mm_load_si128((__m128i*)&pimpl->triggers.get()[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]);
|
||||
_mm_store_si128((__m128i*)trigger, xmm0);
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP void HostProxyFifo::pop() {
|
||||
*(volatile uint64_t*)(&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
|
||||
*(volatile uint64_t*)(&pimpl->triggers.get()[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
|
||||
(pimpl->hostTail)++;
|
||||
}
|
||||
|
||||
@@ -56,13 +52,19 @@ MSCCLPP_API_CPP void HostProxyFifo::flushTail(bool sync) {
|
||||
// Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure
|
||||
// that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush
|
||||
// request.
|
||||
CUDATHROW(cudaMemcpyAsync(pimpl->deviceFifo.tailReplica, &pimpl->hostTail, sizeof(uint64_t), cudaMemcpyHostToDevice,
|
||||
pimpl->stream));
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(pimpl->tailReplica.get(), &pimpl->hostTail, sizeof(uint64_t),
|
||||
cudaMemcpyHostToDevice, pimpl->stream));
|
||||
if (sync) {
|
||||
CUDATHROW(cudaStreamSynchronize(pimpl->stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(pimpl->stream));
|
||||
}
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP DeviceProxyFifo HostProxyFifo::deviceFifo() { return pimpl->deviceFifo; }
|
||||
MSCCLPP_API_CPP DeviceProxyFifo HostProxyFifo::deviceFifo() {
|
||||
DeviceProxyFifo deviceFifo;
|
||||
deviceFifo.triggers = pimpl->triggers.get();
|
||||
deviceFifo.head = pimpl->head.get();
|
||||
deviceFifo.tailReplica = pimpl->tailReplica.get();
|
||||
return deviceFifo;
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
#include <string>
|
||||
|
||||
#include "api.h"
|
||||
#include "checks.hpp"
|
||||
#include "checks_internal.hpp"
|
||||
#include "debug.h"
|
||||
|
||||
#define MAXCONNECTIONS 64
|
||||
|
||||
21
src/include/checks_internal.hpp
Normal file
21
src/include/checks_internal.hpp
Normal file
@@ -0,0 +1,21 @@
|
||||
#ifndef MSCCLPP_CHECKS_OLD_HPP_
|
||||
#define MSCCLPP_CHECKS_OLD_HPP_
|
||||
|
||||
#include <mscclpp/checks.hpp>
|
||||
|
||||
#define MSCCLPPTHROW(call) \
|
||||
do { \
|
||||
mscclppResult_t res = call; \
|
||||
mscclpp::ErrorCode err = mscclpp::ErrorCode::InternalError; \
|
||||
if (res != mscclppSuccess && res != mscclppInProgress) { \
|
||||
if (res == mscclppInvalidUsage) { \
|
||||
err = mscclpp::ErrorCode::InvalidUsage; \
|
||||
} else if (res == mscclppSystemError) { \
|
||||
err = mscclpp::ErrorCode::SystemError; \
|
||||
} \
|
||||
throw mscclpp::Error(std::string("Call to " #call " failed. ") + __FILE__ + ":" + std::to_string(__LINE__), \
|
||||
err); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#endif
|
||||
@@ -5,7 +5,7 @@
|
||||
#include <algorithm>
|
||||
|
||||
#include "api.h"
|
||||
#include "checks.hpp"
|
||||
#include "checks_internal.hpp"
|
||||
#include "utils.h"
|
||||
|
||||
namespace mscclpp {
|
||||
@@ -19,8 +19,8 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t
|
||||
|
||||
void* baseDataPtr;
|
||||
size_t baseDataSize; // dummy
|
||||
CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data));
|
||||
CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr));
|
||||
MSCCLPP_CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data));
|
||||
MSCCLPP_CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr));
|
||||
// TODO: bug with offset of base?
|
||||
transportInfo.cudaIpcBaseHandle = handle;
|
||||
transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr;
|
||||
@@ -133,7 +133,7 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
|
||||
if (localHostHash == this->hostHash) {
|
||||
auto entry = getTransportInfo(Transport::CudaIpc);
|
||||
void* base;
|
||||
CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
|
||||
MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
|
||||
data = static_cast<char*>(base) + entry.cudaIpcOffsetFromBase;
|
||||
INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", data);
|
||||
}
|
||||
|
||||
@@ -19,6 +19,6 @@ configure_file(run_mpi_test.sh.in run_mpi_test.sh)
|
||||
|
||||
# Unit tests
|
||||
add_executable(unit_tests)
|
||||
target_link_libraries(unit_tests GTest::gtest_main GTest::gmock_main mscclpp)
|
||||
target_link_libraries(unit_tests GTest::gtest_main GTest::gmock_main mscclpp CUDA::cudart CUDA::cuda_driver)
|
||||
add_subdirectory(unit) # This adds the sources to the mscclpp target
|
||||
gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST)
|
||||
|
||||
@@ -271,7 +271,7 @@ void test_write_with_host_epochs(int rank, int worldSize, int nRanksPerNode, int
|
||||
|
||||
for (int i = 0; i < worldSize; i++) {
|
||||
if (i != rank && connections[i]->transport() != mscclpp::Transport::CudaIpc) {
|
||||
epochs[i]->increamentAndSignal();
|
||||
epochs[i]->incrementAndSignal();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#include "alloc.h"
|
||||
#include "checks.h"
|
||||
#include "ib.hpp"
|
||||
#include "infiniband/verbs.h"
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/cuda_utils.hpp>
|
||||
#include <array>
|
||||
#include <string>
|
||||
|
||||
@@ -33,16 +33,15 @@ int main(int argc, const char* argv[])
|
||||
|
||||
CUDACHECK(cudaSetDevice(cudaDevId));
|
||||
|
||||
int* data;
|
||||
int nelem = 1;
|
||||
MSCCLPPCHECK(mscclppCudaCalloc(&data, nelem));
|
||||
auto data = mscclpp::makeUniqueCuda<int>(nelem);
|
||||
|
||||
std::shared_ptr<mscclpp::Bootstrap> bootstrap(new mscclpp::Bootstrap(isSend, 2));
|
||||
bootstrap->initialize(ipPortPair);
|
||||
|
||||
mscclpp::IbCtx ctx(ibDevName);
|
||||
mscclpp::IbQp* qp = ctx.createQp();
|
||||
const mscclpp::IbMr* mr = ctx.registerMr(data, sizeof(int) * nelem);
|
||||
const mscclpp::IbMr* mr = ctx.registerMr(data.get(), sizeof(int) * nelem);
|
||||
|
||||
std::array<mscclpp::IbQpInfo, 2> qpInfo;
|
||||
qpInfo[isSend] = qp->getInfo();
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
target_sources(unit_tests PRIVATE
|
||||
core_tests.cc
|
||||
cuda_memory_tests.cc
|
||||
)
|
||||
|
||||
12
test/unit/cuda_memory_tests.cc
Normal file
12
test/unit/cuda_memory_tests.cc
Normal file
@@ -0,0 +1,12 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <mscclpp/cuda_utils.hpp>
|
||||
|
||||
TEST(CudaMemoryTest, Shared) {
|
||||
auto p1 = mscclpp::makeSharedCuda<uint32_t>();
|
||||
auto p2 = mscclpp::makeSharedCuda<int64_t>(5);
|
||||
}
|
||||
|
||||
TEST(CudaMemoryTest, Unique) {
|
||||
auto p1 = mscclpp::makeUniqueCuda<uint32_t>();
|
||||
auto p2 = mscclpp::makeUniqueCuda<int64_t>(5);
|
||||
}
|
||||
Reference in New Issue
Block a user