mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-24 14:54:51 +00:00
Remove alloc.h and beef up cuda_utils.hpp (#82)
This commit is contained in:
@@ -10,6 +10,8 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
// A RAII guard that will cudaThreadExchangeStreamCaptureMode to cudaStreamCaptureModeRelaxed on construction and
|
||||
// restore the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture.
|
||||
struct AvoidCudaGraphCaptureGuard {
|
||||
AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) {
|
||||
MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_));
|
||||
@@ -18,6 +20,7 @@ struct AvoidCudaGraphCaptureGuard {
|
||||
cudaStreamCaptureMode mode_;
|
||||
};
|
||||
|
||||
// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction.
|
||||
struct CudaStreamWithFlags {
|
||||
CudaStreamWithFlags(unsigned int flags) { MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags)); }
|
||||
~CudaStreamWithFlags() { cudaStreamDestroy(stream_); }
|
||||
@@ -48,7 +51,7 @@ T* cudaHostCalloc(size_t nelem) {
|
||||
}
|
||||
|
||||
template <class T, T*(alloc)(size_t), class Deleter, class Memory>
|
||||
Memory safeMake(size_t nelem) {
|
||||
Memory safeAlloc(size_t nelem) {
|
||||
T* ptr = nullptr;
|
||||
try {
|
||||
ptr = alloc(nelem);
|
||||
@@ -63,46 +66,98 @@ Memory safeMake(size_t nelem) {
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// A deleter that calls cudaFree for use with std::unique_ptr/std::shared_ptr.
|
||||
template <class T>
|
||||
struct CudaDeleter {
|
||||
void operator()(T* ptr) {
|
||||
using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
|
||||
void operator()(TPtrOrArray ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaFree(ptr));
|
||||
}
|
||||
};
|
||||
|
||||
// A deleter that calls cudaFreeHost for use with std::unique_ptr/std::shared_ptr.
|
||||
template <class T>
|
||||
struct CudaHostDeleter {
|
||||
void operator()(T* ptr) {
|
||||
using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
|
||||
void operator()(TPtrOrArray ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
|
||||
}
|
||||
};
|
||||
|
||||
// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
|
||||
template <class T>
|
||||
std::shared_ptr<T> makeSharedCuda(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
|
||||
std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
|
||||
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
using UniqueCudaPtr = std::unique_ptr<T, CudaDeleter<T>>;
|
||||
|
||||
// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
|
||||
template <class T>
|
||||
UniqueCudaPtr<T> makeUniqueCuda(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
|
||||
UniqueCudaPtr<T> allocUniqueCuda(size_t count = 1) {
|
||||
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
|
||||
}
|
||||
|
||||
// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::shared_ptr to it.
|
||||
template <class T, typename... Args>
|
||||
std::shared_ptr<T> makeSharedCudaHost(Args&&... args) {
|
||||
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(1);
|
||||
new (ptr.get()) T(std::forward<Args>(args)...);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
|
||||
// std::shared_ptr to it.
|
||||
template <class T>
|
||||
std::shared_ptr<T> makeSharedCudaHost(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(count);
|
||||
std::shared_ptr<T[]> makeSharedCudaHost(size_t count) {
|
||||
using TElem = std::remove_extent_t<T>;
|
||||
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<TElem>, std::shared_ptr<T[]>>(count);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
new (&ptr[i]) TElem();
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
using UniqueCudaHostPtr = std::unique_ptr<T, CudaHostDeleter<T>>;
|
||||
|
||||
// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::unique_ptr to it.
|
||||
template <class T, typename... Args, std::enable_if_t<false == std::is_array_v<T>, bool> = true>
|
||||
UniqueCudaHostPtr<T> makeUniqueCudaHost(Args&&... args) {
|
||||
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(1);
|
||||
new (ptr.get()) T(std::forward<Args>(args)...);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
|
||||
// std::unique_ptr to it.
|
||||
template <class T, std::enable_if_t<true == std::is_array_v<T>, bool> = true>
|
||||
UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
|
||||
using TElem = std::remove_extent_t<T>;
|
||||
auto ptr = detail::safeAlloc<TElem, detail::cudaHostCalloc<TElem>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
new (&ptr[i]) TElem();
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Asynchronous cudaMemcpy without capture into a CUDA graph.
|
||||
template <class T>
|
||||
UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
|
||||
void memcpyCudaAsync(T* dst, const T* src, size_t count, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
|
||||
}
|
||||
|
||||
// Synchronous cudaMemcpy without capture into a CUDA graph.
|
||||
template <class T>
|
||||
void memcpyCuda(T* dst, const T* src, size_t count, cudaMemcpyKind kind = cudaMemcpyDefault) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
Reference in New Issue
Block a user