Cuda smart pointers

Olli Saarikivi
2023-05-11 00:03:08 +00:00
parent 00d4896c25
commit 4e4d1972e3
17 changed files with 221 additions and 110 deletions

View File

@@ -5,25 +5,9 @@
#include <cuda_runtime.h>
#include <mscclpp/errors.hpp>
#include <string>
#include "debug.h"
#define MSCCLPPTHROW(call) \
do { \
mscclppResult_t res = call; \
mscclpp::ErrorCode err = mscclpp::ErrorCode::InternalError; \
if (res != mscclppSuccess && res != mscclppInProgress) { \
if (res == mscclppInvalidUsage) { \
err = mscclpp::ErrorCode::InvalidUsage; \
} else if (res == mscclppSystemError) { \
err = mscclpp::ErrorCode::SystemError; \
} \
throw mscclpp::Error(std::string("Call to " #call " failed. ") + __FILE__ + ":" + std::to_string(__LINE__), \
err); \
} \
} while (false)
#define CUDATHROW(cmd) \
#define MSCCLPP_CUDATHROW(cmd) \
do { \
cudaError_t err = cmd; \
if (err != cudaSuccess) { \
@@ -32,7 +16,7 @@
} \
} while (false)
#define CUTHROW(cmd) \
#define MSCCLPP_CUTHROW(cmd) \
do { \
CUresult err = cmd; \
if (err != CUDA_SUCCESS) { \
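
The renamed macros are drop-in replacements for the old CUDATHROW and CUTHROW; a minimal usage sketch (call sites hypothetical, not part of this commit):

// Hypothetical call sites; on failure each macro throws mscclpp::Error
// carrying the failing expression, file, and line.
MSCCLPP_CUDATHROW(cudaSetDevice(0));  // runtime-API variant
MSCCLPP_CUTHROW(cuInit(0));           // driver-API variant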

View File

@@ -0,0 +1,109 @@
#ifndef MSCCLPP_CUDA_UTILS_HPP_
#define MSCCLPP_CUDA_UTILS_HPP_
#include <cuda_runtime.h>
#include <cstring>
#include <memory>
#include <mscclpp/checks.hpp>
namespace mscclpp {
struct AvoidCudaGraphCaptureGuard {
AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) {
MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_));
}
~AvoidCudaGraphCaptureGuard() { cudaThreadExchangeStreamCaptureMode(&mode_); }
cudaStreamCaptureMode mode_;
};
struct CudaStreamWithFlags {
CudaStreamWithFlags(unsigned int flags) { MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags)); }
~CudaStreamWithFlags() { cudaStreamDestroy(stream_); }
operator cudaStream_t() const { return stream_; }
cudaStream_t stream_;
};
namespace detail {
template <class T>
T* cudaCalloc(size_t nelem) {
AvoidCudaGraphCaptureGuard cgcGuard;
T* ptr;
CudaStreamWithFlags stream(cudaStreamNonBlocking);
MSCCLPP_CUDATHROW(cudaMalloc(&ptr, nelem * sizeof(T)));
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, nelem * sizeof(T), stream));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
return ptr;
}
template <class T>
T* cudaHostCalloc(size_t nelem) {
AvoidCudaGraphCaptureGuard cgcGuard;
T* ptr;
MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, nelem * sizeof(T), cudaHostAllocMapped | cudaHostAllocWriteCombined));
memset(ptr, 0, nelem * sizeof(T));
return ptr;
}
template <class T, T*(alloc)(size_t), class Deleter, class Memory>
Memory safeMake(size_t nelem) {
T* ptr = nullptr;
try {
ptr = alloc(nelem);
} catch (...) {
// Free any partially completed allocation, then propagate the error.
if (ptr) {
Deleter()(ptr);
}
throw;
}
return Memory(ptr, Deleter());
}
} // namespace detail
template <class T>
struct CudaDeleter {
void operator()(T* ptr) {
AvoidCudaGraphCaptureGuard cgcGuard;
MSCCLPP_CUDATHROW(cudaFree(ptr));
}
};
template <class T>
struct CudaHostDeleter {
void operator()(T* ptr) {
AvoidCudaGraphCaptureGuard cgcGuard;
MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
}
};
template <class T>
std::shared_ptr<T> makeSharedCuda(size_t count = 1) {
return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
}
template <class T>
using UniqueCudaPtr = std::unique_ptr<T, CudaDeleter<T>>;
template <class T>
UniqueCudaPtr<T> makeUniqueCuda(size_t count = 1) {
return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
}
template <class T>
std::shared_ptr<T> makeSharedCudaHost(size_t count = 1) {
return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(count);
}
template <class T>
using UniqueCudaHostPtr = std::unique_ptr<T, CudaHostDeleter<T>>;
template <class T>
UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count = 1) {
return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
}
} // namespace mscclpp
#endif // MSCCLPP_CUDA_UTILS_HPP_
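
A sketch of how the new allocators compose (usage hypothetical, not part of this commit). Both paths zero-initialize, and both deleters run under an AvoidCudaGraphCaptureGuard, so cleanup stays legal while a stream capture is active:

#include <mscclpp/cuda_utils.hpp>

void example() {
  auto devBuf = mscclpp::makeSharedCuda<int>(1024);       // cudaMalloc + async memset to zero
  auto hostBuf = mscclpp::makeUniqueCudaHost<int>(1024);  // cudaHostAlloc, mapped + write-combined
  hostBuf.get()[0] = 42;
  MSCCLPP_CUDATHROW(cudaMemcpy(devBuf.get(), hostBuf.get(), 1024 * sizeof(int), cudaMemcpyHostToDevice));
}  // hostBuf freed via cudaFreeHost, devBuf via cudaFree when the last shared_ptr drops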

View File

@@ -1,7 +1,9 @@
#ifndef MSCCLPP_EPOCH_HPP_
#define MSCCLPP_EPOCH_HPP_
#include <memory>
#include <mscclpp/core.hpp>
#include <mscclpp/cuda_utils.hpp>
namespace mscclpp {
@@ -10,6 +12,7 @@ struct alignas(16) EpochIds {
uint64_t inboundReplica;
};
template <template <typename> typename Deleter>
class BaseEpoch {
private:
std::shared_ptr<Connection> connection_;
@@ -17,21 +20,31 @@ class BaseEpoch {
NonblockingFuture<RegisteredMemory> remoteEpochIdsRegMem_;
protected:
EpochIds* epochIds_;
uint64_t* expectedInboundEpochId_;
std::unique_ptr<EpochIds, Deleter<EpochIds>> epochIds_;
std::unique_ptr<uint64_t, Deleter<uint64_t>> expectedInboundEpochId_;
public:
BaseEpoch(std::shared_ptr<Connection> connection);
void setup(Communicator& communicator);
BaseEpoch(const BaseEpoch&) = delete;
void signal();
BaseEpoch(std::shared_ptr<Connection> connection, std::unique_ptr<EpochIds, Deleter<EpochIds>> epochIds,
std::unique_ptr<uint64_t, Deleter<uint64_t>> expectedInboundEpochId)
: connection_(connection),
epochIds_(std::move(epochIds)),
expectedInboundEpochId_(std::move(expectedInboundEpochId)) {}
void setup(Communicator& communicator) {
localEpochIdsRegMem_ = communicator.registerMemory(epochIds_.get(), sizeof(epochIds_), connection_->transport());
communicator.sendMemoryOnSetup(localEpochIdsRegMem_, connection_->remoteRank(), connection_->tag());
remoteEpochIdsRegMem_ = communicator.recvMemoryOnSetup(connection_->remoteRank(), connection_->tag());
}
void signal() {
connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica), localEpochIdsRegMem_,
offsetof(EpochIds, outbound), sizeof(epochIds_));
}
};
class DeviceEpoch : BaseEpoch {
class DeviceEpoch : BaseEpoch<CudaDeleter> {
public:
DeviceEpoch(Communicator& communicator, std::shared_ptr<Connection> connection);
DeviceEpoch(const DeviceEpoch&) = delete;
~DeviceEpoch();
void signal();
struct DeviceHandle {
@@ -52,13 +65,11 @@ class DeviceEpoch : BaseEpoch {
DeviceHandle deviceHandle();
};
class HostEpoch : BaseEpoch {
class HostEpoch : BaseEpoch<std::default_delete> {
public:
HostEpoch(Communicator& communicator, std::shared_ptr<Connection> connection);
HostEpoch(const HostEpoch&) = delete;
~HostEpoch();
void increamentAndSignal();
void incrementAndSignal();
void wait();
};
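
The Deleter template template parameter makes the storage class of the epoch ids a property of the type; a reduced standalone sketch of the same pattern (names hypothetical):

#include <memory>

// Same shape as BaseEpoch<Deleter>: the ownership strategy is a template
// template parameter, so one implementation serves device and host memory.
template <template <typename> typename Deleter>
class Owner {
  std::unique_ptr<int, Deleter<int>> value_;

 public:
  explicit Owner(std::unique_ptr<int, Deleter<int>> v) : value_(std::move(v)) {}
};

Owner<std::default_delete> hostOwner(std::make_unique<int>(0));  // like HostEpoch
// Owner<mscclpp::CudaDeleter> devOwner(mscclpp::makeUniqueCuda<int>());  // like DeviceEpoch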

View File

@@ -11,7 +11,7 @@
#include <vector>
#include "api.h"
#include "checks.hpp"
#include "checks_internal.hpp"
#include "socket.h"
#include "utils.h"

View File

@@ -1,7 +1,7 @@
#include <mscclpp/channel.hpp>
#include "api.h"
#include "checks.hpp"
#include "checks_internal.hpp"
#include "debug.h"
#include "utils.h"
@@ -12,7 +12,7 @@ MSCCLPP_API_CPP DeviceChannelService::DeviceChannelService(Communicator& communi
: communicator_(communicator),
proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) {
int cudaDevice;
CUDATHROW(cudaGetDevice(&cudaDevice));
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
MSCCLPPTHROW(getDeviceNumaNode(cudaDevice, &deviceNumaNode));
}

View File

@@ -4,7 +4,7 @@
#include <sstream>
#include "api.h"
#include "checks.hpp"
#include "checks_internal.hpp"
#include "connection.hpp"
#include "debug.h"
#include "registered_memory.hpp"
@@ -19,7 +19,7 @@ Communicator::Impl::Impl(std::shared_ptr<BaseBootstrap> bootstrap) : bootstrap_(
rankToHash_[bootstrap->getRank()] = hostHash;
bootstrap->allGather(rankToHash_.data(), sizeof(uint64_t));
CUDATHROW(cudaStreamCreateWithFlags(&ipcStream_, cudaStreamNonBlocking));
MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&ipcStream_, cudaStreamNonBlocking));
}
Communicator::Impl::~Impl() {

View File

@@ -2,7 +2,8 @@
#include <algorithm>
#include "checks.hpp"
#include "checks_internal.hpp"
#include "debug.h"
#include "infiniband/verbs.h"
#include "npkit/npkit.h"
#include "registered_memory.hpp"
@@ -47,14 +48,14 @@ void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, Register
char* dstPtr = (char*)dst.data();
char* srcPtr = (char*)src.data();
CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, stream_));
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, stream_));
INFO(MSCCLPP_P2P, "CudaIpcConnection write: from %p to %p, size %lu", srcPtr + srcOffset, dstPtr + dstOffset, size);
// npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)size);
}
void CudaIpcConnection::flush() {
CUDATHROW(cudaStreamSynchronize(stream_));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream_));
// npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT);
}

View File

@@ -1,61 +1,32 @@
#include <mscclpp/epoch.hpp>
#include "alloc.h"
#include "api.h"
#include "checks.hpp"
namespace mscclpp {
BaseEpoch::BaseEpoch(std::shared_ptr<Connection> connection) : connection_(connection) {}
void BaseEpoch::setup(Communicator& communicator) {
localEpochIdsRegMem_ = communicator.registerMemory(epochIds_, sizeof(epochIds_), connection_->transport());
communicator.sendMemoryOnSetup(localEpochIdsRegMem_, connection_->remoteRank(), connection_->tag());
remoteEpochIdsRegMem_ = communicator.recvMemoryOnSetup(connection_->remoteRank(), connection_->tag());
}
void BaseEpoch::signal() {
connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica), localEpochIdsRegMem_,
offsetof(EpochIds, outbound), sizeof(epochIds_));
}
MSCCLPP_API_CPP DeviceEpoch::DeviceEpoch(Communicator& communicator, std::shared_ptr<Connection> connection)
: BaseEpoch(connection) {
MSCCLPPTHROW(mscclppCudaCalloc(&epochIds_, 1));
MSCCLPPTHROW(mscclppCudaCalloc(&expectedInboundEpochId_, 1));
: BaseEpoch(connection, makeUniqueCuda<EpochIds>(), makeUniqueCuda<uint64_t>()) {
setup(communicator);
}
MSCCLPP_API_CPP DeviceEpoch::~DeviceEpoch() {
mscclppCudaFree(epochIds_);
mscclppCudaFree(expectedInboundEpochId_);
}
MSCCLPP_API_CPP void DeviceEpoch::signal() { BaseEpoch::signal(); }
MSCCLPP_API_CPP DeviceEpoch::DeviceHandle DeviceEpoch::deviceHandle() {
DeviceEpoch::DeviceHandle device;
device.epochIds = epochIds_;
device.expectedInboundEpochId = expectedInboundEpochId_;
device.epochIds = epochIds_.get();
device.expectedInboundEpochId = expectedInboundEpochId_.get();
return device;
}
MSCCLPP_API_CPP HostEpoch::HostEpoch(Communicator& communicator, std::shared_ptr<Connection> connection)
: BaseEpoch(connection) {
: BaseEpoch(connection, std::make_unique<EpochIds>(), std::make_unique<uint64_t>()) {
if (connection->transport() == Transport::CudaIpc) {
throw Error("HostEpoch cannot be used with CudaIpc transport", ErrorCode::InvalidUsage);
}
epochIds_ = new EpochIds();
expectedInboundEpochId_ = new uint64_t();
setup(communicator);
}
MSCCLPP_API_CPP HostEpoch::~HostEpoch() {
delete epochIds_;
delete expectedInboundEpochId_;
}
MSCCLPP_API_CPP void HostEpoch::increamentAndSignal() {
MSCCLPP_API_CPP void HostEpoch::incrementAndSignal() {
*(volatile uint64_t*)&(epochIds_->outbound) += 1;
signal();
}
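
Putting the two epoch types side by side, a hypothetical caller (communicator and connections assumed already set up, not part of this commit):

mscclpp::DeviceEpoch deviceEpoch(communicator, connection);  // ids in device memory
deviceEpoch.signal();
auto handle = deviceEpoch.deviceHandle();  // raw pointers for use in device code

mscclpp::HostEpoch hostEpoch(communicator, ibConnection);  // ids on the host; CudaIpc transport rejected
hostEpoch.incrementAndSignal();
hostEpoch.wait();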

View File

@@ -1,17 +1,19 @@
#include <cuda_runtime.h>
#include <emmintrin.h>
#include <mscclpp/cuda_utils.hpp>
#include <mscclpp/fifo.hpp>
#include <stdexcept>
#include "alloc.h"
#include "api.h"
#include "checks.hpp"
#include "checks_internal.hpp"
namespace mscclpp {
struct HostProxyFifo::Impl {
DeviceProxyFifo deviceFifo;
UniqueCudaHostPtr<ProxyTrigger> triggers;
UniqueCudaPtr<uint64_t> head;
UniqueCudaPtr<uint64_t> tailReplica;
// allocated on the host. Only accessed by the host. This is a copy of the
// value pointed to by fifoTailDev and the invariant is that
@@ -23,32 +25,26 @@ struct HostProxyFifo::Impl {
uint64_t hostTail;
// for transferring fifo tail
cudaStream_t stream;
CudaStreamWithFlags stream;
Impl()
: triggers(makeUniqueCudaHost<ProxyTrigger>(MSCCLPP_PROXY_FIFO_SIZE)),
head(makeUniqueCuda<uint64_t>(1)),
tailReplica(makeUniqueCuda<uint64_t>(1)),
hostTail(0),
stream(cudaStreamNonBlocking) {}
};
MSCCLPP_API_CPP HostProxyFifo::HostProxyFifo() {
pimpl = std::make_unique<Impl>();
MSCCLPPTHROW(mscclppCudaCalloc(&pimpl->deviceFifo.head, 1));
MSCCLPPTHROW(mscclppCudaHostCalloc(&pimpl->deviceFifo.triggers, MSCCLPP_PROXY_FIFO_SIZE));
MSCCLPPTHROW(mscclppCudaCalloc(&pimpl->deviceFifo.tailReplica, 1));
CUDATHROW(cudaStreamCreateWithFlags(&pimpl->stream, cudaStreamNonBlocking));
pimpl->hostTail = 0;
}
MSCCLPP_API_CPP HostProxyFifo::~HostProxyFifo() {
mscclppCudaFree(pimpl->deviceFifo.head);
mscclppCudaHostFree(pimpl->deviceFifo.triggers);
mscclppCudaFree(pimpl->deviceFifo.tailReplica);
cudaStreamDestroy(pimpl->stream);
}
MSCCLPP_API_CPP HostProxyFifo::HostProxyFifo() : pimpl(std::make_unique<Impl>()) {}
MSCCLPP_API_CPP HostProxyFifo::~HostProxyFifo() = default;
MSCCLPP_API_CPP void HostProxyFifo::poll(ProxyTrigger* trigger) {
__m128i xmm0 = _mm_load_si128((__m128i*)&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]);
__m128i xmm0 = _mm_load_si128((__m128i*)&pimpl->triggers.get()[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]);
_mm_store_si128((__m128i*)trigger, xmm0);
}
MSCCLPP_API_CPP void HostProxyFifo::pop() {
*(volatile uint64_t*)(&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
*(volatile uint64_t*)(&pimpl->triggers.get()[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
(pimpl->hostTail)++;
}
@@ -56,13 +52,19 @@ MSCCLPP_API_CPP void HostProxyFifo::flushTail(bool sync) {
// Flush the tail to device memory. This is triggered either every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER pops, so the
// fifo can make progress even when no mscclppSync request arrives, or by an mscclppSync request, whose purpose is
// to force a flush.
CUDATHROW(cudaMemcpyAsync(pimpl->deviceFifo.tailReplica, &pimpl->hostTail, sizeof(uint64_t), cudaMemcpyHostToDevice,
pimpl->stream));
MSCCLPP_CUDATHROW(cudaMemcpyAsync(pimpl->tailReplica.get(), &pimpl->hostTail, sizeof(uint64_t),
cudaMemcpyHostToDevice, pimpl->stream));
if (sync) {
CUDATHROW(cudaStreamSynchronize(pimpl->stream));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(pimpl->stream));
}
}
MSCCLPP_API_CPP DeviceProxyFifo HostProxyFifo::deviceFifo() { return pimpl->deviceFifo; }
MSCCLPP_API_CPP DeviceProxyFifo HostProxyFifo::deviceFifo() {
DeviceProxyFifo deviceFifo;
deviceFifo.triggers = pimpl->triggers.get();
deviceFifo.head = pimpl->head.get();
deviceFifo.tailReplica = pimpl->tailReplica.get();
return deviceFifo;
}
} // namespace mscclpp
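
A sketch of a host proxy loop built from poll/pop/flushTail (the loop structure and the ProxyTrigger field name fst are assumptions, not part of this commit):

mscclpp::HostProxyFifo fifo;
mscclpp::ProxyTrigger trigger;
uint64_t popped = 0;
for (;;) {
  fifo.poll(&trigger);
  if (trigger.fst == 0) continue;  // empty slot; field name assumed
  // ... dispatch the trigger ...
  fifo.pop();
  if (++popped % MSCCLPP_PROXY_FIFO_FLUSH_COUNTER == 0) fifo.flushTail(false);  // periodic async flush
}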

View File

@@ -12,7 +12,7 @@
#include <string>
#include "api.h"
#include "checks.hpp"
#include "checks_internal.hpp"
#include "debug.h"
#define MAXCONNECTIONS 64

View File

@@ -0,0 +1,21 @@
#ifndef MSCCLPP_CHECKS_OLD_HPP_
#define MSCCLPP_CHECKS_OLD_HPP_
#include <mscclpp/checks.hpp>
#define MSCCLPPTHROW(call) \
do { \
mscclppResult_t res = call; \
mscclpp::ErrorCode err = mscclpp::ErrorCode::InternalError; \
if (res != mscclppSuccess && res != mscclppInProgress) { \
if (res == mscclppInvalidUsage) { \
err = mscclpp::ErrorCode::InvalidUsage; \
} else if (res == mscclppSystemError) { \
err = mscclpp::ErrorCode::SystemError; \
} \
throw mscclpp::Error(std::string("Call to " #call " failed. ") + __FILE__ + ":" + std::to_string(__LINE__), \
err); \
} \
} while (false)
#endif
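
MSCCLPPTHROW keeps its old behavior for the C-style mscclppResult_t APIs; a sketch mirroring the DeviceChannelService constructor above:

// Translate a mscclppResult_t failure into a thrown mscclpp::Error.
int cudaDevice;
int deviceNumaNode;
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
MSCCLPPTHROW(getDeviceNumaNode(cudaDevice, &deviceNumaNode));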

View File

@@ -5,7 +5,7 @@
#include <algorithm>
#include "api.h"
#include "checks.hpp"
#include "checks_internal.hpp"
#include "utils.h"
namespace mscclpp {
@@ -19,8 +19,8 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t
void* baseDataPtr;
size_t baseDataSize; // dummy
CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data));
CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr));
MSCCLPP_CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data));
MSCCLPP_CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr));
// TODO: bug with offset of base?
transportInfo.cudaIpcBaseHandle = handle;
transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr;
@@ -133,7 +133,7 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
if (localHostHash == this->hostHash) {
auto entry = getTransportInfo(Transport::CudaIpc);
void* base;
CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
data = static_cast<char*>(base) + entry.cudaIpcOffsetFromBase;
INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", data);
}

View File

@@ -19,6 +19,6 @@ configure_file(run_mpi_test.sh.in run_mpi_test.sh)
# Unit tests
add_executable(unit_tests)
target_link_libraries(unit_tests GTest::gtest_main GTest::gmock_main mscclpp)
target_link_libraries(unit_tests GTest::gtest_main GTest::gmock_main mscclpp CUDA::cudart CUDA::cuda_driver)
add_subdirectory(unit) # This adds the sources to the mscclpp target
gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST)

View File

@@ -271,7 +271,7 @@ void test_write_with_host_epochs(int rank, int worldSize, int nRanksPerNode, int
for (int i = 0; i < worldSize; i++) {
if (i != rank && connections[i]->transport() != mscclpp::Transport::CudaIpc) {
epochs[i]->increamentAndSignal();
epochs[i]->incrementAndSignal();
}
}

View File

@@ -1,8 +1,8 @@
#include "alloc.h"
#include "checks.h"
#include "ib.hpp"
#include "infiniband/verbs.h"
#include <mscclpp/core.hpp>
#include <mscclpp/cuda_utils.hpp>
#include <array>
#include <string>
@@ -33,16 +33,15 @@ int main(int argc, const char* argv[])
CUDACHECK(cudaSetDevice(cudaDevId));
int* data;
int nelem = 1;
MSCCLPPCHECK(mscclppCudaCalloc(&data, nelem));
auto data = mscclpp::makeUniqueCuda<int>(nelem);
std::shared_ptr<mscclpp::Bootstrap> bootstrap(new mscclpp::Bootstrap(isSend, 2));
bootstrap->initialize(ipPortPair);
mscclpp::IbCtx ctx(ibDevName);
mscclpp::IbQp* qp = ctx.createQp();
const mscclpp::IbMr* mr = ctx.registerMr(data, sizeof(int) * nelem);
const mscclpp::IbMr* mr = ctx.registerMr(data.get(), sizeof(int) * nelem);
std::array<mscclpp::IbQpInfo, 2> qpInfo;
qpInfo[isSend] = qp->getInfo();

View File

@@ -1,3 +1,4 @@
target_sources(unit_tests PRIVATE
core_tests.cc
cuda_memory_tests.cc
)

View File

@@ -0,0 +1,12 @@
#include <gtest/gtest.h>
#include <mscclpp/cuda_utils.hpp>
TEST(CudaMemoryTest, Shared) {
auto p1 = mscclpp::makeSharedCuda<uint32_t>();
auto p2 = mscclpp::makeSharedCuda<int64_t>(5);
}
TEST(CudaMemoryTest, Unique) {
auto p1 = mscclpp::makeUniqueCuda<uint32_t>();
auto p2 = mscclpp::makeUniqueCuda<int64_t>(5);
}
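
A possible follow-up assertion (not in this commit) would check the zero fill that cudaCalloc and cudaHostCalloc provide:

TEST(CudaMemoryTest, ZeroInitialized) {
  auto p = mscclpp::makeSharedCuda<uint32_t>();
  uint32_t host = 0xdeadbeef;
  MSCCLPP_CUDATHROW(cudaMemcpy(&host, p.get(), sizeof(host), cudaMemcpyDeviceToHost));
  EXPECT_EQ(host, 0u);
}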