mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
Remove alloc.h and beef up cuda_utils.hpp (#82)
This commit is contained in:
@@ -10,6 +10,8 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
// A RAII guard that will cudaThreadExchangeStreamCaptureMode to cudaStreamCaptureModeRelaxed on construction and
|
||||
// restore the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture.
|
||||
struct AvoidCudaGraphCaptureGuard {
|
||||
AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) {
|
||||
MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_));
|
||||
@@ -18,6 +20,7 @@ struct AvoidCudaGraphCaptureGuard {
|
||||
cudaStreamCaptureMode mode_;
|
||||
};
|
||||
|
||||
// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction.
|
||||
struct CudaStreamWithFlags {
|
||||
CudaStreamWithFlags(unsigned int flags) { MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags)); }
|
||||
~CudaStreamWithFlags() { cudaStreamDestroy(stream_); }
|
||||
@@ -48,7 +51,7 @@ T* cudaHostCalloc(size_t nelem) {
|
||||
}
|
||||
|
||||
template <class T, T*(alloc)(size_t), class Deleter, class Memory>
|
||||
Memory safeMake(size_t nelem) {
|
||||
Memory safeAlloc(size_t nelem) {
|
||||
T* ptr = nullptr;
|
||||
try {
|
||||
ptr = alloc(nelem);
|
||||
@@ -63,46 +66,98 @@ Memory safeMake(size_t nelem) {
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// A deleter that calls cudaFree for use with std::unique_ptr/std::shared_ptr.
|
||||
template <class T>
|
||||
struct CudaDeleter {
|
||||
void operator()(T* ptr) {
|
||||
using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
|
||||
void operator()(TPtrOrArray ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaFree(ptr));
|
||||
}
|
||||
};
|
||||
|
||||
// A deleter that calls cudaFreeHost for use with std::unique_ptr/std::shared_ptr.
|
||||
template <class T>
|
||||
struct CudaHostDeleter {
|
||||
void operator()(T* ptr) {
|
||||
using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
|
||||
void operator()(TPtrOrArray ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
|
||||
}
|
||||
};
|
||||
|
||||
// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
|
||||
template <class T>
|
||||
std::shared_ptr<T> makeSharedCuda(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
|
||||
std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
|
||||
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
using UniqueCudaPtr = std::unique_ptr<T, CudaDeleter<T>>;
|
||||
|
||||
// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
|
||||
template <class T>
|
||||
UniqueCudaPtr<T> makeUniqueCuda(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
|
||||
UniqueCudaPtr<T> allocUniqueCuda(size_t count = 1) {
|
||||
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
|
||||
}
|
||||
|
||||
// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::shared_ptr to it.
|
||||
template <class T, typename... Args>
|
||||
std::shared_ptr<T> makeSharedCudaHost(Args&&... args) {
|
||||
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(1);
|
||||
new (ptr.get()) T(std::forward<Args>(args)...);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
|
||||
// std::shared_ptr to it.
|
||||
template <class T>
|
||||
std::shared_ptr<T> makeSharedCudaHost(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(count);
|
||||
std::shared_ptr<T[]> makeSharedCudaHost(size_t count) {
|
||||
using TElem = std::remove_extent_t<T>;
|
||||
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<TElem>, std::shared_ptr<T[]>>(count);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
new (&ptr[i]) TElem();
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
using UniqueCudaHostPtr = std::unique_ptr<T, CudaHostDeleter<T>>;
|
||||
|
||||
// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::unique_ptr to it.
|
||||
template <class T, typename... Args, std::enable_if_t<false == std::is_array_v<T>, bool> = true>
|
||||
UniqueCudaHostPtr<T> makeUniqueCudaHost(Args&&... args) {
|
||||
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(1);
|
||||
new (ptr.get()) T(std::forward<Args>(args)...);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
|
||||
// std::unique_ptr to it.
|
||||
template <class T, std::enable_if_t<true == std::is_array_v<T>, bool> = true>
|
||||
UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
|
||||
using TElem = std::remove_extent_t<T>;
|
||||
auto ptr = detail::safeAlloc<TElem, detail::cudaHostCalloc<TElem>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
new (&ptr[i]) TElem();
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Asynchronous cudaMemcpy without capture into a CUDA graph.
|
||||
template <class T>
|
||||
UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count = 1) {
|
||||
return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
|
||||
void memcpyCudaAsync(T* dst, const T* src, size_t count, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
|
||||
}
|
||||
|
||||
// Synchronous cudaMemcpy without capture into a CUDA graph.
|
||||
template <class T>
|
||||
void memcpyCuda(T* dst, const T* src, size_t count, cudaMemcpyKind kind = cudaMemcpyDefault) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "api.h"
|
||||
#include "checks.h"
|
||||
#include "checks_internal.hpp"
|
||||
#include "socket.h"
|
||||
#include "utils.h"
|
||||
|
||||
@@ -6,12 +6,16 @@
|
||||
|
||||
#include "socket.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <ifaddrs.h>
|
||||
#include <net/if.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "checks.h"
|
||||
#include "config.h"
|
||||
#include "debug.h"
|
||||
#include "utils.h"
|
||||
|
||||
static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
namespace mscclpp {
|
||||
|
||||
MSCCLPP_API_CPP DeviceEpoch::DeviceEpoch(Communicator& communicator, std::shared_ptr<Connection> connection)
|
||||
: BaseEpoch(connection, makeUniqueCuda<EpochIds>(), makeUniqueCuda<uint64_t>()) {
|
||||
: BaseEpoch(connection, allocUniqueCuda<EpochIds>(), allocUniqueCuda<uint64_t>()) {
|
||||
setup(communicator);
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
namespace mscclpp {
|
||||
|
||||
struct HostProxyFifo::Impl {
|
||||
UniqueCudaHostPtr<ProxyTrigger> triggers;
|
||||
UniqueCudaHostPtr<ProxyTrigger[]> triggers;
|
||||
UniqueCudaPtr<uint64_t> head;
|
||||
UniqueCudaPtr<uint64_t> tailReplica;
|
||||
|
||||
@@ -28,9 +28,9 @@ struct HostProxyFifo::Impl {
|
||||
CudaStreamWithFlags stream;
|
||||
|
||||
Impl()
|
||||
: triggers(makeUniqueCudaHost<ProxyTrigger>(MSCCLPP_PROXY_FIFO_SIZE)),
|
||||
head(makeUniqueCuda<uint64_t>(1)),
|
||||
tailReplica(makeUniqueCuda<uint64_t>(1)),
|
||||
: triggers(makeUniqueCudaHost<ProxyTrigger[]>(MSCCLPP_PROXY_FIFO_SIZE)),
|
||||
head(allocUniqueCuda<uint64_t>()),
|
||||
tailReplica(allocUniqueCuda<uint64_t>()),
|
||||
hostTail(0),
|
||||
stream(cudaStreamNonBlocking) {}
|
||||
};
|
||||
|
||||
@@ -1,182 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef MSCCLPP_ALLOC_H_
|
||||
#define MSCCLPP_ALLOC_H_
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "align.h"
|
||||
#include "checks.h"
|
||||
#include "mscclpp.h"
|
||||
#include "utils.h"
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem * sizeof(T), cudaHostAllocMapped | cudaHostAllocWriteCombined), result,
|
||||
finish);
|
||||
memset(*ptr, 0, nelem * sizeof(T));
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem * sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define mscclppCudaHostCalloc(...) mscclppCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
inline mscclppResult_t mscclppCudaHostFree(void* ptr) {
|
||||
CUDACHECK(cudaFreeHost(ptr));
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
|
||||
void* p = malloc(nelem * sizeof(T));
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
|
||||
return mscclppSystemError;
|
||||
}
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), p);
|
||||
memset(p, 0, nelem * sizeof(T));
|
||||
*ptr = (T*)p;
|
||||
return mscclppSuccess;
|
||||
}
|
||||
#define mscclppCalloc(...) mscclppCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
||||
if (nelem < oldNelem) return mscclppInternalError;
|
||||
if (nelem == oldNelem) return mscclppSuccess;
|
||||
|
||||
T* oldp = *ptr;
|
||||
T* p = (T*)malloc(nelem * sizeof(T));
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
|
||||
return mscclppSystemError;
|
||||
}
|
||||
memcpy(p, oldp, oldNelem * sizeof(T));
|
||||
free(oldp);
|
||||
memset(p + oldNelem, 0, (nelem - oldNelem) * sizeof(T));
|
||||
*ptr = (T*)p;
|
||||
INFO(MSCCLPP_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem * sizeof(T), nelem * sizeof(T),
|
||||
*ptr);
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem * sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define mscclppCudaMalloc(...) mscclppCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream;
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define mscclppCudaCalloc(...) mscclppCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char* filefunc,
|
||||
int line) {
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem * sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define mscclppCudaCallocAsync(...) mscclppCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream;
|
||||
CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish);
|
||||
MSCCLPPCHECKGOTO(mscclppCudaMemcpyAsync(dst, src, nelem, stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) {
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem * sizeof(T), cudaMemcpyDefault, stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaFree(T* ptr) {
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaFree(ptr), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
|
||||
// allocated on separate pages as those pages will be marked DONTFORK
|
||||
// and if they are shared, that could cause a crash in a child process
|
||||
inline mscclppResult_t mscclppIbMallocDebug(void** ptr, size_t size, const char* filefunc, int line) {
|
||||
size_t page_size = sysconf(_SC_PAGESIZE);
|
||||
void* p;
|
||||
int size_aligned = ROUNDUP(size, page_size);
|
||||
int ret = posix_memalign(&p, page_size, size_aligned);
|
||||
if (ret != 0) return mscclppSystemError;
|
||||
memset(p, 0, size);
|
||||
*ptr = p;
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
|
||||
return mscclppSuccess;
|
||||
}
|
||||
#define mscclppIbMalloc(...) mscclppIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
#endif
|
||||
@@ -8,11 +8,11 @@
|
||||
#define MSCCLPP_UTILS_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <chrono>
|
||||
|
||||
#include "alloc.h"
|
||||
// #include "mscclpp.h"
|
||||
#include "mscclpp.h"
|
||||
|
||||
// int mscclppCudaCompCap();
|
||||
|
||||
|
||||
@@ -6,49 +6,43 @@
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
|
||||
#include "alloc.h"
|
||||
|
||||
uint64_t NpKit::rank_ = 0;
|
||||
|
||||
NpKitEvent** NpKit::gpu_event_buffers_ = nullptr;
|
||||
NpKitEvent** NpKit::cpu_event_buffers_ = nullptr;
|
||||
std::vector<mscclpp::UniqueCudaPtr<NpKitEvent>> NpKit::gpu_event_buffers_;
|
||||
std::vector<std::unique_ptr<NpKitEvent[]>> NpKit::cpu_event_buffers_;
|
||||
|
||||
NpKitEventCollectContext* NpKit::gpu_collect_contexts_ = nullptr;
|
||||
NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr;
|
||||
mscclpp::UniqueCudaPtr<NpKitEventCollectContext> NpKit::gpu_collect_contexts_;
|
||||
std::unique_ptr<NpKitEventCollectContext[]> NpKit::cpu_collect_contexts_;
|
||||
uint64_t NpKit::cpu_base_system_timestamp_ = 0;
|
||||
uint64_t NpKit::cpu_base_steady_timestamp_ = 0;
|
||||
|
||||
mscclppResult_t NpKit::Init(int rank) {
|
||||
void NpKit::Init(int rank) {
|
||||
uint64_t i = 0;
|
||||
NpKitEventCollectContext ctx;
|
||||
ctx.event_buffer_head = 0;
|
||||
rank_ = rank;
|
||||
|
||||
// Init event data structures
|
||||
MSCCLPPCHECK(mscclppCalloc(&gpu_event_buffers_, kNumGpuEventBuffers));
|
||||
MSCCLPPCHECK(mscclppCudaCalloc(&gpu_collect_contexts_, kNumGpuEventBuffers));
|
||||
gpu_collect_contexts_ = mscclpp::allocUniqueCuda<NpKitEventCollectContext>(kNumGpuEventBuffers);
|
||||
for (i = 0; i < kNumGpuEventBuffers; i++) {
|
||||
MSCCLPPCHECK(mscclppCudaCalloc(gpu_event_buffers_ + i, kMaxNumGpuEventsPerBuffer));
|
||||
ctx.event_buffer = gpu_event_buffers_[i];
|
||||
MSCCLPPCHECK(mscclppCudaMemcpy(gpu_collect_contexts_ + i, &ctx, 1));
|
||||
gpu_event_buffers_.emplace_back(mscclpp::allocUniqueCuda<NpKitEvent>(kMaxNumGpuEventsPerBuffer));
|
||||
ctx.event_buffer = gpu_event_buffers_[i].get();
|
||||
mscclpp::memcpyCuda(gpu_collect_contexts_.get() + i, &ctx, 1);
|
||||
}
|
||||
|
||||
MSCCLPPCHECK(mscclppCalloc(&cpu_event_buffers_, kNumCpuEventBuffers));
|
||||
MSCCLPPCHECK(mscclppCalloc(&cpu_collect_contexts_, kNumCpuEventBuffers));
|
||||
cpu_collect_contexts_ = std::make_unique<NpKitEventCollectContext[]>(kNumCpuEventBuffers);
|
||||
for (i = 0; i < kNumCpuEventBuffers; i++) {
|
||||
MSCCLPPCHECK(mscclppCalloc(cpu_event_buffers_ + i, kMaxNumCpuEventsPerBuffer));
|
||||
ctx.event_buffer = cpu_event_buffers_[i];
|
||||
cpu_event_buffers_.emplace_back(std::make_unique<NpKitEvent[]>(kMaxNumCpuEventsPerBuffer));
|
||||
ctx.event_buffer = cpu_event_buffers_[i].get();
|
||||
cpu_collect_contexts_[i] = ctx;
|
||||
}
|
||||
|
||||
// Init timestamp
|
||||
cpu_base_system_timestamp_ = std::chrono::system_clock::now().time_since_epoch().count();
|
||||
cpu_base_steady_timestamp_ = std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
|
||||
void NpKit::Dump(const std::string& dump_dir) {
|
||||
uint64_t i = 0;
|
||||
std::string dump_file_path;
|
||||
|
||||
@@ -60,7 +54,7 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
|
||||
dump_file_path += "_channel_";
|
||||
dump_file_path += std::to_string(i);
|
||||
auto cpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
|
||||
cpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[i]),
|
||||
cpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[i].get()),
|
||||
cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent));
|
||||
cpu_trace_file.close();
|
||||
}
|
||||
@@ -89,10 +83,10 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
|
||||
dump_file_path += std::to_string(rank_);
|
||||
dump_file_path += "_buf_";
|
||||
dump_file_path += std::to_string(i);
|
||||
MSCCLPPCHECK(mscclppCudaMemcpy(cpu_event_buffers_[0], gpu_event_buffers_[i], kMaxNumGpuEventsPerBuffer));
|
||||
MSCCLPPCHECK(mscclppCudaMemcpy(cpu_collect_contexts_, gpu_collect_contexts_ + i, 1));
|
||||
mscclpp::memcpyCuda(cpu_event_buffers_[0].get(), gpu_event_buffers_[i].get(), kMaxNumGpuEventsPerBuffer);
|
||||
mscclpp::memcpyCuda(cpu_collect_contexts_.get(), gpu_collect_contexts_.get() + i, 1);
|
||||
auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
|
||||
gpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[0]),
|
||||
gpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[0].get()),
|
||||
cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
|
||||
gpu_trace_file.close();
|
||||
}
|
||||
@@ -103,37 +97,27 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
|
||||
dump_file_path += std::to_string(rank_);
|
||||
cudaDeviceProp dev_prop;
|
||||
int dev;
|
||||
CUDACHECK(cudaGetDevice(&dev));
|
||||
CUDACHECK(cudaGetDeviceProperties(&dev_prop, dev));
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&dev));
|
||||
MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev));
|
||||
std::string clock_rate_str = std::to_string(dev_prop.clockRate);
|
||||
auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
|
||||
gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
|
||||
gpu_clock_rate_file.close();
|
||||
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t NpKit::Shutdown() {
|
||||
void NpKit::Shutdown() {
|
||||
uint64_t i = 0;
|
||||
|
||||
// Free CPU event data structures
|
||||
for (i = 0; i < kNumCpuEventBuffers; i++) {
|
||||
free(cpu_event_buffers_[i]);
|
||||
}
|
||||
free(cpu_event_buffers_);
|
||||
free(cpu_collect_contexts_);
|
||||
cpu_event_buffers_.clear();
|
||||
cpu_collect_contexts_.reset();
|
||||
|
||||
// Free GPU event data structures
|
||||
for (i = 0; i < kNumGpuEventBuffers; i++) {
|
||||
CUDACHECK(cudaFree(gpu_event_buffers_[i]));
|
||||
}
|
||||
free(gpu_event_buffers_);
|
||||
CUDACHECK(cudaFree(gpu_collect_contexts_));
|
||||
|
||||
return mscclppSuccess;
|
||||
gpu_event_buffers_.clear();
|
||||
gpu_collect_contexts_.reset();
|
||||
}
|
||||
|
||||
NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() { return gpu_collect_contexts_; }
|
||||
NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() { return gpu_collect_contexts_.get(); }
|
||||
|
||||
void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) {
|
||||
uint64_t event_buffer_head = cpu_collect_contexts_[channel_id].event_buffer_head;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#ifndef NPKIT_H_
|
||||
#define NPKIT_H_
|
||||
|
||||
#include <mscclpp/cuda_utils.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "mscclpp.h"
|
||||
#include "npkit_event.h"
|
||||
#include "npkit_struct.h"
|
||||
|
||||
@@ -13,11 +14,11 @@ class NpKit {
|
||||
|
||||
static const uint64_t kNumCpuEventBuffers = 32;
|
||||
|
||||
static mscclppResult_t Init(int rank);
|
||||
static void Init(int rank);
|
||||
|
||||
static mscclppResult_t Dump(const std::string& dump_dir);
|
||||
static void Dump(const std::string& dump_dir);
|
||||
|
||||
static mscclppResult_t Shutdown();
|
||||
static void Shutdown();
|
||||
|
||||
static NpKitEventCollectContext* GetGpuEventCollectContexts();
|
||||
|
||||
@@ -47,11 +48,11 @@ class NpKit {
|
||||
// 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU
|
||||
static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;
|
||||
|
||||
static NpKitEvent** gpu_event_buffers_;
|
||||
static NpKitEvent** cpu_event_buffers_;
|
||||
static std::vector<mscclpp::UniqueCudaPtr<NpKitEvent>> gpu_event_buffers_;
|
||||
static std::vector<std::unique_ptr<NpKitEvent[]>> cpu_event_buffers_;
|
||||
|
||||
static NpKitEventCollectContext* gpu_collect_contexts_;
|
||||
static NpKitEventCollectContext* cpu_collect_contexts_;
|
||||
static mscclpp::UniqueCudaPtr<NpKitEventCollectContext> gpu_collect_contexts_;
|
||||
static std::unique_ptr<NpKitEventCollectContext[]> cpu_collect_contexts_;
|
||||
|
||||
static uint64_t cpu_base_system_timestamp_;
|
||||
static uint64_t cpu_base_steady_timestamp_;
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
#include "api.h"
|
||||
#include "checks_internal.hpp"
|
||||
#include "debug.h"
|
||||
#include "utils.h"
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
@@ -6,12 +6,16 @@
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <numa.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "checks.h"
|
||||
|
||||
// Get current Compute Capability
|
||||
// int mscclppCudaCompCap() {
|
||||
// int cudaDev;
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include <mscclpp/fifo.hpp>
|
||||
#include <mscclpp/proxy.hpp>
|
||||
#include <mscclpp/epoch.hpp>
|
||||
#include <mscclpp/checks.hpp>
|
||||
#include <utils.h>
|
||||
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
@@ -392,9 +393,9 @@ int main(int argc, char* argv[])
|
||||
printf("Stopping MSCCL++ proxy threads\n");
|
||||
proxyService.stop();
|
||||
|
||||
CUDACHECK(cudaFree(data_d));
|
||||
CUDACHECK(cudaFree(deviceHandles1));
|
||||
CUDACHECK(cudaFree(deviceHandles2));
|
||||
MSCCLPP_CUDATHROW(cudaFree(data_d));
|
||||
MSCCLPP_CUDATHROW(cudaFree(deviceHandles1));
|
||||
MSCCLPP_CUDATHROW(cudaFree(deviceHandles2));
|
||||
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
MPI_Finalize();
|
||||
|
||||
@@ -34,7 +34,7 @@ int main(int argc, const char* argv[])
|
||||
CUDACHECK(cudaSetDevice(cudaDevId));
|
||||
|
||||
int nelem = 1;
|
||||
auto data = mscclpp::makeUniqueCuda<int>(nelem);
|
||||
auto data = mscclpp::allocUniqueCuda<int>(nelem);
|
||||
|
||||
std::shared_ptr<mscclpp::Bootstrap> bootstrap(new mscclpp::Bootstrap(isSend, 2));
|
||||
bootstrap->initialize(ipPortPair);
|
||||
|
||||
@@ -200,7 +200,7 @@ class AllGatherTestEngine : public BaseTestEngine {
|
||||
};
|
||||
|
||||
void AllGatherTestEngine::allocateBuffer() {
|
||||
sendBuff_ = mscclpp::makeSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
sendBuff_ = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
expectedBuff_ = std::shared_ptr<int[]>(new int[args_.maxBytes / sizeof(int)]);
|
||||
}
|
||||
|
||||
|
||||
@@ -119,8 +119,8 @@ class SendRecvTestEngine : public BaseTestEngine {
|
||||
};
|
||||
|
||||
void SendRecvTestEngine::allocateBuffer() {
|
||||
std::shared_ptr<int> sendBuff = mscclpp::makeSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
std::shared_ptr<int> recvBuff = mscclpp::makeSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
std::shared_ptr<int> sendBuff = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
std::shared_ptr<int> recvBuff = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
devicePtrs_.push_back(sendBuff);
|
||||
devicePtrs_.push_back(recvBuff);
|
||||
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
#include <mscclpp/cuda_utils.hpp>
|
||||
|
||||
TEST(CudaMemoryTest, Shared) {
|
||||
auto p1 = mscclpp::makeSharedCuda<uint32_t>();
|
||||
auto p2 = mscclpp::makeSharedCuda<int64_t>(5);
|
||||
auto p1 = mscclpp::allocSharedCuda<uint32_t>();
|
||||
auto p2 = mscclpp::allocSharedCuda<int64_t>(5);
|
||||
}
|
||||
|
||||
TEST(CudaMemoryTest, Unique) {
|
||||
auto p1 = mscclpp::makeUniqueCuda<uint32_t>();
|
||||
auto p2 = mscclpp::makeUniqueCuda<int64_t>(5);
|
||||
auto p1 = mscclpp::allocUniqueCuda<uint32_t>();
|
||||
auto p2 = mscclpp::allocUniqueCuda<int64_t>(5);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user