diff --git a/include/mscclpp/cuda_utils.hpp b/include/mscclpp/cuda_utils.hpp index 58e1ebe8..3e232247 100644 --- a/include/mscclpp/cuda_utils.hpp +++ b/include/mscclpp/cuda_utils.hpp @@ -10,6 +10,8 @@ namespace mscclpp { +// A RAII guard that will cudaThreadExchangeStreamCaptureMode to cudaStreamCaptureModeRelaxed on construction and +// restore the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture. struct AvoidCudaGraphCaptureGuard { AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) { MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_)); @@ -18,6 +20,7 @@ struct AvoidCudaGraphCaptureGuard { cudaStreamCaptureMode mode_; }; +// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction. struct CudaStreamWithFlags { CudaStreamWithFlags(unsigned int flags) { MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags)); } ~CudaStreamWithFlags() { cudaStreamDestroy(stream_); } @@ -48,7 +51,7 @@ T* cudaHostCalloc(size_t nelem) { } template -Memory safeMake(size_t nelem) { +Memory safeAlloc(size_t nelem) { T* ptr = nullptr; try { ptr = alloc(nelem); @@ -63,46 +66,98 @@ Memory safeMake(size_t nelem) { } // namespace detail +// A deleter that calls cudaFree for use with std::unique_ptr/std::shared_ptr. template struct CudaDeleter { - void operator()(T* ptr) { + using TPtrOrArray = std::conditional_t, T, T*>; + void operator()(TPtrOrArray ptr) { AvoidCudaGraphCaptureGuard cgcGuard; MSCCLPP_CUDATHROW(cudaFree(ptr)); } }; +// A deleter that calls cudaFreeHost for use with std::unique_ptr/std::shared_ptr. template struct CudaHostDeleter { - void operator()(T* ptr) { + using TPtrOrArray = std::conditional_t, T, T*>; + void operator()(TPtrOrArray ptr) { AvoidCudaGraphCaptureGuard cgcGuard; MSCCLPP_CUDATHROW(cudaFreeHost(ptr)); } }; +// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out. template -std::shared_ptr makeSharedCuda(size_t count = 1) { - return detail::safeMake, CudaDeleter, std::shared_ptr>(count); +std::shared_ptr allocSharedCuda(size_t count = 1) { + return detail::safeAlloc, CudaDeleter, std::shared_ptr>(count); } template using UniqueCudaPtr = std::unique_ptr>; +// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out. template -UniqueCudaPtr makeUniqueCuda(size_t count = 1) { - return detail::safeMake, CudaDeleter, UniqueCudaPtr>(count); +UniqueCudaPtr allocUniqueCuda(size_t count = 1) { + return detail::safeAlloc, CudaDeleter, UniqueCudaPtr>(count); } +// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::shared_ptr to it. +template +std::shared_ptr makeSharedCudaHost(Args&&... args) { + auto ptr = detail::safeAlloc, CudaHostDeleter, std::shared_ptr>(1); + new (ptr.get()) T(std::forward(args)...); + return ptr; +} + +// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a +// std::shared_ptr to it. template -std::shared_ptr makeSharedCudaHost(size_t count = 1) { - return detail::safeMake, CudaHostDeleter, std::shared_ptr>(count); +std::shared_ptr makeSharedCudaHost(size_t count) { + using TElem = std::remove_extent_t; + auto ptr = detail::safeAlloc, CudaHostDeleter, std::shared_ptr>(count); + for (size_t i = 0; i < count; ++i) { + new (&ptr[i]) TElem(); + } + return ptr; } template using UniqueCudaHostPtr = std::unique_ptr>; +// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::unique_ptr to it. +template , bool> = true> +UniqueCudaHostPtr makeUniqueCudaHost(Args&&... args) { + auto ptr = detail::safeAlloc, CudaHostDeleter, UniqueCudaHostPtr>(1); + new (ptr.get()) T(std::forward(args)...); + return ptr; +} + +// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a +// std::unique_ptr to it. +template , bool> = true> +UniqueCudaHostPtr makeUniqueCudaHost(size_t count) { + using TElem = std::remove_extent_t; + auto ptr = detail::safeAlloc, CudaHostDeleter, UniqueCudaHostPtr>(count); + for (size_t i = 0; i < count; ++i) { + new (&ptr[i]) TElem(); + } + return ptr; +} + +// Asynchronous cudaMemcpy without capture into a CUDA graph. template -UniqueCudaHostPtr makeUniqueCudaHost(size_t count = 1) { - return detail::safeMake, CudaHostDeleter, UniqueCudaHostPtr>(count); +void memcpyCudaAsync(T* dst, const T* src, size_t count, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) { + AvoidCudaGraphCaptureGuard cgcGuard; + MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream)); +} + +// Synchronous cudaMemcpy without capture into a CUDA graph. +template +void memcpyCuda(T* dst, const T* src, size_t count, cudaMemcpyKind kind = cudaMemcpyDefault) { + AvoidCudaGraphCaptureGuard cgcGuard; + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); } } // namespace mscclpp diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index c39095ce..dfd22d64 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -11,6 +11,7 @@ #include #include "api.h" +#include "checks.h" #include "checks_internal.hpp" #include "socket.h" #include "utils.h" diff --git a/src/bootstrap/socket.cc b/src/bootstrap/socket.cc index b60815b6..568eaa50 100644 --- a/src/bootstrap/socket.cc +++ b/src/bootstrap/socket.cc @@ -6,12 +6,16 @@ #include "socket.h" +#include #include #include #include +#include #include +#include "checks.h" #include "config.h" +#include "debug.h" #include "utils.h" static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset, diff --git a/src/epoch.cc b/src/epoch.cc index 3dd4cb84..96148114 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -5,7 +5,7 @@ namespace mscclpp { MSCCLPP_API_CPP DeviceEpoch::DeviceEpoch(Communicator& communicator, std::shared_ptr connection) - : BaseEpoch(connection, makeUniqueCuda(), makeUniqueCuda()) { + : BaseEpoch(connection, allocUniqueCuda(), allocUniqueCuda()) { setup(communicator); } diff --git a/src/fifo.cc b/src/fifo.cc index 5dcf25f9..d7676926 100644 --- a/src/fifo.cc +++ b/src/fifo.cc @@ -11,7 +11,7 @@ namespace mscclpp { struct HostProxyFifo::Impl { - UniqueCudaHostPtr triggers; + UniqueCudaHostPtr triggers; UniqueCudaPtr head; UniqueCudaPtr tailReplica; @@ -28,9 +28,9 @@ struct HostProxyFifo::Impl { CudaStreamWithFlags stream; Impl() - : triggers(makeUniqueCudaHost(MSCCLPP_PROXY_FIFO_SIZE)), - head(makeUniqueCuda(1)), - tailReplica(makeUniqueCuda(1)), + : triggers(makeUniqueCudaHost(MSCCLPP_PROXY_FIFO_SIZE)), + head(allocUniqueCuda()), + tailReplica(allocUniqueCuda()), hostTail(0), stream(cudaStreamNonBlocking) {} }; diff --git a/src/include/alloc.h b/src/include/alloc.h deleted file mode 100644 index 5de23e87..00000000 --- a/src/include/alloc.h +++ /dev/null @@ -1,182 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef MSCCLPP_ALLOC_H_ -#define MSCCLPP_ALLOC_H_ - -#include -#include -#include - -#include "align.h" -#include "checks.h" -#include "mscclpp.h" -#include "utils.h" - -template -mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) { - mscclppResult_t result = mscclppSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaHostAlloc(ptr, nelem * sizeof(T), cudaHostAllocMapped | cudaHostAllocWriteCombined), result, - finish); - memset(*ptr, 0, nelem * sizeof(T)); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem * sizeof(T)); - INFO(MSCCLPP_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr); - return result; -} -#define mscclppCudaHostCalloc(...) mscclppCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -inline mscclppResult_t mscclppCudaHostFree(void* ptr) { - CUDACHECK(cudaFreeHost(ptr)); - return mscclppSuccess; -} - -template -mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) { - void* p = malloc(nelem * sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem * sizeof(T)); - return mscclppSystemError; - } - INFO(MSCCLPP_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), p); - memset(p, 0, nelem * sizeof(T)); - *ptr = (T*)p; - return mscclppSuccess; -} -#define mscclppCalloc(...) mscclppCallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, size_t nelem) { - if (nelem < oldNelem) return mscclppInternalError; - if (nelem == oldNelem) return mscclppSuccess; - - T* oldp = *ptr; - T* p = (T*)malloc(nelem * sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem * sizeof(T)); - return mscclppSystemError; - } - memcpy(p, oldp, oldNelem * sizeof(T)); - free(oldp); - memset(p + oldNelem, 0, (nelem - oldNelem) * sizeof(T)); - *ptr = (T*)p; - INFO(MSCCLPP_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem * sizeof(T), nelem * sizeof(T), - *ptr); - return mscclppSuccess; -} - -template -mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) { - mscclppResult_t result = mscclppSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem * sizeof(T)); - INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr); - return result; -} -#define mscclppCudaMalloc(...) mscclppCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) { - mscclppResult_t result = mscclppSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - // Need a side stream so as not to interfere with graph capture. - cudaStream_t stream; - CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish); - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish); - CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); - CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T)); - INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr); - return result; -} -#define mscclppCudaCalloc(...) mscclppCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char* filefunc, - int line) { - mscclppResult_t result = mscclppSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish); - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem * sizeof(T)); - INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr); - return result; -} -#define mscclppCudaCallocAsync(...) mscclppCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -mscclppResult_t mscclppCudaMemcpy(T* dst, T* src, size_t nelem) { - mscclppResult_t result = mscclppSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - // Need a side stream so as not to interfere with graph capture. - cudaStream_t stream; - CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); - MSCCLPPCHECKGOTO(mscclppCudaMemcpyAsync(dst, src, nelem, stream), result, finish); - CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); - CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - return result; -} - -template -mscclppResult_t mscclppCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) { - mscclppResult_t result = mscclppSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem * sizeof(T), cudaMemcpyDefault, stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - return result; -} - -template -mscclppResult_t mscclppCudaFree(T* ptr) { - mscclppResult_t result = mscclppSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaFree(ptr), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - return result; -} - -// Allocate memory to be potentially ibv_reg_mr'd. This needs to be -// allocated on separate pages as those pages will be marked DONTFORK -// and if they are shared, that could cause a crash in a child process -inline mscclppResult_t mscclppIbMallocDebug(void** ptr, size_t size, const char* filefunc, int line) { - size_t page_size = sysconf(_SC_PAGESIZE); - void* p; - int size_aligned = ROUNDUP(size, page_size); - int ret = posix_memalign(&p, page_size, size_aligned); - if (ret != 0) return mscclppSystemError; - memset(p, 0, size); - *ptr = p; - INFO(MSCCLPP_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); - return mscclppSuccess; -} -#define mscclppIbMalloc(...) mscclppIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -#endif diff --git a/src/include/utils.h b/src/include/utils.h index f3318031..5dbca982 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -8,11 +8,11 @@ #define MSCCLPP_UTILS_H_ #include +#include #include -#include "alloc.h" -// #include "mscclpp.h" +#include "mscclpp.h" // int mscclppCudaCompCap(); diff --git a/src/npkit/npkit.cc b/src/npkit/npkit.cc index 49ee7a12..a8fe79a1 100644 --- a/src/npkit/npkit.cc +++ b/src/npkit/npkit.cc @@ -6,49 +6,43 @@ #include #include -#include "alloc.h" - uint64_t NpKit::rank_ = 0; -NpKitEvent** NpKit::gpu_event_buffers_ = nullptr; -NpKitEvent** NpKit::cpu_event_buffers_ = nullptr; +std::vector> NpKit::gpu_event_buffers_; +std::vector> NpKit::cpu_event_buffers_; -NpKitEventCollectContext* NpKit::gpu_collect_contexts_ = nullptr; -NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr; +mscclpp::UniqueCudaPtr NpKit::gpu_collect_contexts_; +std::unique_ptr NpKit::cpu_collect_contexts_; uint64_t NpKit::cpu_base_system_timestamp_ = 0; uint64_t NpKit::cpu_base_steady_timestamp_ = 0; -mscclppResult_t NpKit::Init(int rank) { +void NpKit::Init(int rank) { uint64_t i = 0; NpKitEventCollectContext ctx; ctx.event_buffer_head = 0; rank_ = rank; // Init event data structures - MSCCLPPCHECK(mscclppCalloc(&gpu_event_buffers_, kNumGpuEventBuffers)); - MSCCLPPCHECK(mscclppCudaCalloc(&gpu_collect_contexts_, kNumGpuEventBuffers)); + gpu_collect_contexts_ = mscclpp::allocUniqueCuda(kNumGpuEventBuffers); for (i = 0; i < kNumGpuEventBuffers; i++) { - MSCCLPPCHECK(mscclppCudaCalloc(gpu_event_buffers_ + i, kMaxNumGpuEventsPerBuffer)); - ctx.event_buffer = gpu_event_buffers_[i]; - MSCCLPPCHECK(mscclppCudaMemcpy(gpu_collect_contexts_ + i, &ctx, 1)); + gpu_event_buffers_.emplace_back(mscclpp::allocUniqueCuda(kMaxNumGpuEventsPerBuffer)); + ctx.event_buffer = gpu_event_buffers_[i].get(); + mscclpp::memcpyCuda(gpu_collect_contexts_.get() + i, &ctx, 1); } - MSCCLPPCHECK(mscclppCalloc(&cpu_event_buffers_, kNumCpuEventBuffers)); - MSCCLPPCHECK(mscclppCalloc(&cpu_collect_contexts_, kNumCpuEventBuffers)); + cpu_collect_contexts_ = std::make_unique(kNumCpuEventBuffers); for (i = 0; i < kNumCpuEventBuffers; i++) { - MSCCLPPCHECK(mscclppCalloc(cpu_event_buffers_ + i, kMaxNumCpuEventsPerBuffer)); - ctx.event_buffer = cpu_event_buffers_[i]; + cpu_event_buffers_.emplace_back(std::make_unique(kMaxNumCpuEventsPerBuffer)); + ctx.event_buffer = cpu_event_buffers_[i].get(); cpu_collect_contexts_[i] = ctx; } // Init timestamp cpu_base_system_timestamp_ = std::chrono::system_clock::now().time_since_epoch().count(); cpu_base_steady_timestamp_ = std::chrono::steady_clock::now().time_since_epoch().count(); - - return mscclppSuccess; } -mscclppResult_t NpKit::Dump(const std::string& dump_dir) { +void NpKit::Dump(const std::string& dump_dir) { uint64_t i = 0; std::string dump_file_path; @@ -60,7 +54,7 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) { dump_file_path += "_channel_"; dump_file_path += std::to_string(i); auto cpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary); - cpu_trace_file.write(reinterpret_cast(cpu_event_buffers_[i]), + cpu_trace_file.write(reinterpret_cast(cpu_event_buffers_[i].get()), cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent)); cpu_trace_file.close(); } @@ -89,10 +83,10 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) { dump_file_path += std::to_string(rank_); dump_file_path += "_buf_"; dump_file_path += std::to_string(i); - MSCCLPPCHECK(mscclppCudaMemcpy(cpu_event_buffers_[0], gpu_event_buffers_[i], kMaxNumGpuEventsPerBuffer)); - MSCCLPPCHECK(mscclppCudaMemcpy(cpu_collect_contexts_, gpu_collect_contexts_ + i, 1)); + mscclpp::memcpyCuda(cpu_event_buffers_[0].get(), gpu_event_buffers_[i].get(), kMaxNumGpuEventsPerBuffer); + mscclpp::memcpyCuda(cpu_collect_contexts_.get(), gpu_collect_contexts_.get() + i, 1); auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary); - gpu_trace_file.write(reinterpret_cast(cpu_event_buffers_[0]), + gpu_trace_file.write(reinterpret_cast(cpu_event_buffers_[0].get()), cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent)); gpu_trace_file.close(); } @@ -103,37 +97,27 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) { dump_file_path += std::to_string(rank_); cudaDeviceProp dev_prop; int dev; - CUDACHECK(cudaGetDevice(&dev)); - CUDACHECK(cudaGetDeviceProperties(&dev_prop, dev)); + MSCCLPP_CUDATHROW(cudaGetDevice(&dev)); + MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev)); std::string clock_rate_str = std::to_string(dev_prop.clockRate); auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out); gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length()); gpu_clock_rate_file.close(); - - return mscclppSuccess; } -mscclppResult_t NpKit::Shutdown() { +void NpKit::Shutdown() { uint64_t i = 0; // Free CPU event data structures - for (i = 0; i < kNumCpuEventBuffers; i++) { - free(cpu_event_buffers_[i]); - } - free(cpu_event_buffers_); - free(cpu_collect_contexts_); + cpu_event_buffers_.clear(); + cpu_collect_contexts_.reset(); // Free GPU event data structures - for (i = 0; i < kNumGpuEventBuffers; i++) { - CUDACHECK(cudaFree(gpu_event_buffers_[i])); - } - free(gpu_event_buffers_); - CUDACHECK(cudaFree(gpu_collect_contexts_)); - - return mscclppSuccess; + gpu_event_buffers_.clear(); + gpu_collect_contexts_.reset(); } -NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() { return gpu_collect_contexts_; } +NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() { return gpu_collect_contexts_.get(); } void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) { uint64_t event_buffer_head = cpu_collect_contexts_[channel_id].event_buffer_head; diff --git a/src/npkit/npkit.h b/src/npkit/npkit.h index c15bb812..a324776b 100644 --- a/src/npkit/npkit.h +++ b/src/npkit/npkit.h @@ -1,9 +1,10 @@ #ifndef NPKIT_H_ #define NPKIT_H_ +#include #include +#include -#include "mscclpp.h" #include "npkit_event.h" #include "npkit_struct.h" @@ -13,11 +14,11 @@ class NpKit { static const uint64_t kNumCpuEventBuffers = 32; - static mscclppResult_t Init(int rank); + static void Init(int rank); - static mscclppResult_t Dump(const std::string& dump_dir); + static void Dump(const std::string& dump_dir); - static mscclppResult_t Shutdown(); + static void Shutdown(); static NpKitEventCollectContext* GetGpuEventCollectContexts(); @@ -47,11 +48,11 @@ class NpKit { // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21; - static NpKitEvent** gpu_event_buffers_; - static NpKitEvent** cpu_event_buffers_; + static std::vector> gpu_event_buffers_; + static std::vector> cpu_event_buffers_; - static NpKitEventCollectContext* gpu_collect_contexts_; - static NpKitEventCollectContext* cpu_collect_contexts_; + static mscclpp::UniqueCudaPtr gpu_collect_contexts_; + static std::unique_ptr cpu_collect_contexts_; static uint64_t cpu_base_system_timestamp_; static uint64_t cpu_base_steady_timestamp_; diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 7e42c79c..66853d99 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -6,6 +6,7 @@ #include "api.h" #include "checks_internal.hpp" +#include "debug.h" #include "utils.h" namespace mscclpp { diff --git a/src/utils.cc b/src/utils.cc index 6e9e1970..60a9efec 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -6,12 +6,16 @@ #include "utils.h" +#include #include #include +#include #include #include +#include "checks.h" + // Get current Compute Capability // int mscclppCudaCompCap() { // int cudaDev; diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu index f142c81d..63c72c2f 100644 --- a/test/allgather_test_host_offloading.cu +++ b/test/allgather_test_host_offloading.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include #ifdef MSCCLPP_USE_MPI_FOR_TESTS @@ -392,9 +393,9 @@ int main(int argc, char* argv[]) printf("Stopping MSCCL++ proxy threads\n"); proxyService.stop(); - CUDACHECK(cudaFree(data_d)); - CUDACHECK(cudaFree(deviceHandles1)); - CUDACHECK(cudaFree(deviceHandles2)); + MSCCLPP_CUDATHROW(cudaFree(data_d)); + MSCCLPP_CUDATHROW(cudaFree(deviceHandles1)); + MSCCLPP_CUDATHROW(cudaFree(deviceHandles2)); #ifdef MSCCLPP_USE_MPI_FOR_TESTS MPI_Finalize(); diff --git a/test/ib_test.cc b/test/ib_test.cc index 48ec4c19..8f9f3dc7 100644 --- a/test/ib_test.cc +++ b/test/ib_test.cc @@ -34,7 +34,7 @@ int main(int argc, const char* argv[]) CUDACHECK(cudaSetDevice(cudaDevId)); int nelem = 1; - auto data = mscclpp::makeUniqueCuda(nelem); + auto data = mscclpp::allocUniqueCuda(nelem); std::shared_ptr bootstrap(new mscclpp::Bootstrap(isSend, 2)); bootstrap->initialize(ipPortPair); diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 0517f7c9..89b66221 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -200,7 +200,7 @@ class AllGatherTestEngine : public BaseTestEngine { }; void AllGatherTestEngine::allocateBuffer() { - sendBuff_ = mscclpp::makeSharedCuda(args_.maxBytes / sizeof(int)); + sendBuff_ = mscclpp::allocSharedCuda(args_.maxBytes / sizeof(int)); expectedBuff_ = std::shared_ptr(new int[args_.maxBytes / sizeof(int)]); } diff --git a/test/mscclpp-test/sendrecv_test.cu b/test/mscclpp-test/sendrecv_test.cu index 6b885976..377f7799 100644 --- a/test/mscclpp-test/sendrecv_test.cu +++ b/test/mscclpp-test/sendrecv_test.cu @@ -119,8 +119,8 @@ class SendRecvTestEngine : public BaseTestEngine { }; void SendRecvTestEngine::allocateBuffer() { - std::shared_ptr sendBuff = mscclpp::makeSharedCuda(args_.maxBytes / sizeof(int)); - std::shared_ptr recvBuff = mscclpp::makeSharedCuda(args_.maxBytes / sizeof(int)); + std::shared_ptr sendBuff = mscclpp::allocSharedCuda(args_.maxBytes / sizeof(int)); + std::shared_ptr recvBuff = mscclpp::allocSharedCuda(args_.maxBytes / sizeof(int)); devicePtrs_.push_back(sendBuff); devicePtrs_.push_back(recvBuff); diff --git a/test/unit/cuda_memory_tests.cc b/test/unit/cuda_memory_tests.cc index 3d7bcefd..50bea575 100644 --- a/test/unit/cuda_memory_tests.cc +++ b/test/unit/cuda_memory_tests.cc @@ -2,11 +2,11 @@ #include TEST(CudaMemoryTest, Shared) { - auto p1 = mscclpp::makeSharedCuda(); - auto p2 = mscclpp::makeSharedCuda(5); + auto p1 = mscclpp::allocSharedCuda(); + auto p2 = mscclpp::allocSharedCuda(5); } TEST(CudaMemoryTest, Unique) { - auto p1 = mscclpp::makeUniqueCuda(); - auto p2 = mscclpp::makeUniqueCuda(5); + auto p1 = mscclpp::allocUniqueCuda(); + auto p2 = mscclpp::allocUniqueCuda(5); }