From 4e4d1972e3027691b831a94e2ecdf5caeca29af4 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 11 May 2023 00:03:08 +0000 Subject: [PATCH] Cuda smart pointers --- {src/include => include/mscclpp}/checks.hpp | 22 +--- include/mscclpp/cuda_utils.hpp | 109 ++++++++++++++++++++ include/mscclpp/epoch.hpp | 37 ++++--- src/bootstrap/bootstrap.cc | 2 +- src/channel.cc | 4 +- src/communicator.cc | 4 +- src/connection.cc | 7 +- src/epoch.cc | 39 +------ src/fifo.cc | 52 +++++----- src/ib.cc | 2 +- src/include/checks_internal.hpp | 21 ++++ src/registered_memory.cc | 8 +- test/CMakeLists.txt | 2 +- test/communicator_test_cpp.cu | 2 +- test/ib_test.cc | 7 +- test/unit/CMakeLists.txt | 1 + test/unit/cuda_memory_tests.cc | 12 +++ 17 files changed, 221 insertions(+), 110 deletions(-) rename {src/include => include/mscclpp}/checks.hpp (50%) create mode 100644 include/mscclpp/cuda_utils.hpp create mode 100644 src/include/checks_internal.hpp create mode 100644 test/unit/cuda_memory_tests.cc diff --git a/src/include/checks.hpp b/include/mscclpp/checks.hpp similarity index 50% rename from src/include/checks.hpp rename to include/mscclpp/checks.hpp index 00acc2f3..522748d8 100644 --- a/src/include/checks.hpp +++ b/include/mscclpp/checks.hpp @@ -5,25 +5,9 @@ #include #include +#include -#include "debug.h" - -#define MSCCLPPTHROW(call) \ - do { \ - mscclppResult_t res = call; \ - mscclpp::ErrorCode err = mscclpp::ErrorCode::InternalError; \ - if (res != mscclppSuccess && res != mscclppInProgress) { \ - if (res == mscclppInvalidUsage) { \ - err = mscclpp::ErrorCode::InvalidUsage; \ - } else if (res == mscclppSystemError) { \ - err = mscclpp::ErrorCode::SystemError; \ - } \ - throw mscclpp::Error(std::string("Call to " #call " failed. ") + __FILE__ + ":" + std::to_string(__LINE__), \ - err); \ - } \ - } while (false) - -#define CUDATHROW(cmd) \ +#define MSCCLPP_CUDATHROW(cmd) \ do { \ cudaError_t err = cmd; \ if (err != cudaSuccess) { \ @@ -32,7 +16,7 @@ } \ } while (false) -#define CUTHROW(cmd) \ +#define MSCCLPP_CUTHROW(cmd) \ do { \ CUresult err = cmd; \ if (err != CUDA_SUCCESS) { \ diff --git a/include/mscclpp/cuda_utils.hpp b/include/mscclpp/cuda_utils.hpp new file mode 100644 index 00000000..09ca2a69 --- /dev/null +++ b/include/mscclpp/cuda_utils.hpp @@ -0,0 +1,109 @@ +#ifndef MSCCLPP_CUDA_UTILS_HPP_ +#define MSCCLPP_CUDA_UTILS_HPP_ + +// #include +#include + +#include +#include +#include + +namespace mscclpp { + +struct AvoidCudaGraphCaptureGuard { + AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) { + MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_)); + } + ~AvoidCudaGraphCaptureGuard() { cudaThreadExchangeStreamCaptureMode(&mode_); } + cudaStreamCaptureMode mode_; +}; + +struct CudaStreamWithFlags { + CudaStreamWithFlags(unsigned int flags) { MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags)); } + ~CudaStreamWithFlags() { cudaStreamDestroy(stream_); } + operator cudaStream_t() const { return stream_; } + cudaStream_t stream_; +}; + +namespace detail { + +template +T* cudaCalloc(size_t nelem) { + AvoidCudaGraphCaptureGuard cgcGuard; + T* ptr; + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(cudaMalloc(&ptr, nelem * sizeof(T))); + MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, nelem * sizeof(T), stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); + return ptr; +} + +template +T* cudaHostCalloc(size_t nelem) { + AvoidCudaGraphCaptureGuard cgcGuard; + T* ptr; + MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, nelem * sizeof(T), cudaHostAllocMapped | cudaHostAllocWriteCombined)); + memset(ptr, 0, nelem * sizeof(T)); + return ptr; +} + +template +Memory safeMake(size_t nelem) { + T* ptr = nullptr; + try { + ptr = alloc(nelem); + } catch (...) { + if (ptr) { + Deleter()(ptr); + } + } + return Memory(ptr, Deleter()); +} + +} // namespace detail + +template +struct CudaDeleter { + void operator()(T* ptr) { + AvoidCudaGraphCaptureGuard cgcGuard; + MSCCLPP_CUDATHROW(cudaFree(ptr)); + } +}; + +template +struct CudaHostDeleter { + void operator()(T* ptr) { + AvoidCudaGraphCaptureGuard cgcGuard; + MSCCLPP_CUDATHROW(cudaFreeHost(ptr)); + } +}; + +template +std::shared_ptr makeSharedCuda(size_t count = 1) { + return detail::safeMake, CudaDeleter, std::shared_ptr>(count); +} + +template +using UniqueCudaPtr = std::unique_ptr>; + +template +UniqueCudaPtr makeUniqueCuda(size_t count = 1) { + return detail::safeMake, CudaDeleter, UniqueCudaPtr>(count); +} + +template +std::shared_ptr makeSharedCudaHost(size_t count = 1) { + return detail::safeMake, CudaHostDeleter, std::shared_ptr>(count); +} + +template +using UniqueCudaHostPtr = std::unique_ptr>; + +template +UniqueCudaHostPtr makeUniqueCudaHost(size_t count = 1) { + return detail::safeMake, CudaHostDeleter, UniqueCudaHostPtr>(count); +} + +} // namespace mscclpp + +#endif // MSCCLPP_CUDA_UTILS_HPP_ \ No newline at end of file diff --git a/include/mscclpp/epoch.hpp b/include/mscclpp/epoch.hpp index 539ad03f..42908376 100644 --- a/include/mscclpp/epoch.hpp +++ b/include/mscclpp/epoch.hpp @@ -1,7 +1,9 @@ #ifndef MSCCLPP_EPOCH_HPP_ #define MSCCLPP_EPOCH_HPP_ +#include #include +#include namespace mscclpp { @@ -10,6 +12,7 @@ struct alignas(16) EpochIds { uint64_t inboundReplica; }; +template