Remove alloc.h and beef up cuda_utils.hpp (#82)

This commit is contained in:
Olli Saarikivi
2023-05-24 01:34:18 -07:00
committed by GitHub
parent 216373eab2
commit 457c422791
16 changed files with 129 additions and 260 deletions

View File

@@ -10,6 +10,8 @@
namespace mscclpp {
// A RAII guard that calls cudaThreadExchangeStreamCaptureMode to switch the thread to cudaStreamCaptureModeRelaxed on
// construction and restores the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture.
struct AvoidCudaGraphCaptureGuard {
AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) {
MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_));
@@ -18,6 +20,7 @@ struct AvoidCudaGraphCaptureGuard {
cudaStreamCaptureMode mode_;
};
// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction.
struct CudaStreamWithFlags {
CudaStreamWithFlags(unsigned int flags) { MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags)); }
~CudaStreamWithFlags() { cudaStreamDestroy(stream_); }
@@ -48,7 +51,7 @@ T* cudaHostCalloc(size_t nelem) {
}
template <class T, T*(alloc)(size_t), class Deleter, class Memory>
Memory safeMake(size_t nelem) {
Memory safeAlloc(size_t nelem) {
T* ptr = nullptr;
try {
ptr = alloc(nelem);
@@ -63,46 +66,98 @@ Memory safeMake(size_t nelem) {
} // namespace detail
// A deleter that calls cudaFree for use with std::unique_ptr/std::shared_ptr.
template <class T>
struct CudaDeleter {
void operator()(T* ptr) {
using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
void operator()(TPtrOrArray ptr) {
AvoidCudaGraphCaptureGuard cgcGuard;
MSCCLPP_CUDATHROW(cudaFree(ptr));
}
};
// A deleter that calls cudaFreeHost for use with std::unique_ptr/std::shared_ptr.
template <class T>
struct CudaHostDeleter {
void operator()(T* ptr) {
using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
void operator()(TPtrOrArray ptr) {
AvoidCudaGraphCaptureGuard cgcGuard;
MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
}
};
// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
template <class T>
std::shared_ptr<T> makeSharedCuda(size_t count = 1) {
return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
}
template <class T>
using UniqueCudaPtr = std::unique_ptr<T, CudaDeleter<T>>;
// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
template <class T>
UniqueCudaPtr<T> makeUniqueCuda(size_t count = 1) {
return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
UniqueCudaPtr<T> allocUniqueCuda(size_t count = 1) {
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
}
// Allocates page-locked host memory with cudaHostAlloc (via detail::cudaHostCalloc), constructs a single T in it by
// placement new — forwarding `args` to T's constructor — and returns a std::shared_ptr that releases the memory with
// cudaFreeHost (through CudaHostDeleter) when the last reference drops.
// NOTE(review): CudaHostDeleter appears to call only cudaFreeHost, so ~T() is never run on destruction; confirm that
// callers only use trivially destructible T here.
template <class T, typename... Args>
std::shared_ptr<T> makeSharedCudaHost(Args&&... args) {
// safeAlloc returns raw storage for one T; construct the object in place before handing it out.
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(1);
new (ptr.get()) T(std::forward<Args>(args)...);
return ptr;
}
// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
// std::shared_ptr to it.
template <class T>
std::shared_ptr<T> makeSharedCudaHost(size_t count = 1) {
return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(count);
std::shared_ptr<T[]> makeSharedCudaHost(size_t count) {
using TElem = std::remove_extent_t<T>;
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<TElem>, std::shared_ptr<T[]>>(count);
for (size_t i = 0; i < count; ++i) {
new (&ptr[i]) TElem();
}
return ptr;
}
template <class T>
using UniqueCudaHostPtr = std::unique_ptr<T, CudaHostDeleter<T>>;
// Allocates page-locked host memory with cudaHostAlloc (via detail::cudaHostCalloc), constructs a single T in it by
// placement new — forwarding `args` to T's constructor — and returns a std::unique_ptr owning it.
// The enable_if restricts this overload to non-array T; the array form below takes an element count instead.
// NOTE(review): CudaHostDeleter appears to call only cudaFreeHost, so ~T() is never run on destruction; confirm that
// callers only use trivially destructible T here.
template <class T, typename... Args, std::enable_if_t<false == std::is_array_v<T>, bool> = true>
UniqueCudaHostPtr<T> makeUniqueCudaHost(Args&&... args) {
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(1);
// Construct the object in the raw storage returned by safeAlloc.
new (ptr.get()) T(std::forward<Args>(args)...);
return ptr;
}
// Allocates an array of `count` elements with cudaHostAlloc (via detail::cudaHostCalloc), default-constructs each
// element in place and returns a std::unique_ptr owning the array.
// The enable_if restricts this overload to array types (e.g. makeUniqueCudaHost<Foo[]>(n)); TElem is the element type
// stripped of the array extent.
template <class T, std::enable_if_t<true == std::is_array_v<T>, bool> = true>
UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
using TElem = std::remove_extent_t<T>;
auto ptr = detail::safeAlloc<TElem, detail::cudaHostCalloc<TElem>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
// Default-construct every element in the raw storage.
for (size_t i = 0; i < count; ++i) {
new (&ptr[i]) TElem();
}
return ptr;
}
// Asynchronous cudaMemcpy without capture into a CUDA graph.
template <class T>
UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count = 1) {
return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
void memcpyCudaAsync(T* dst, const T* src, size_t count, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) {
AvoidCudaGraphCaptureGuard cgcGuard;
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
}
// Synchronous cudaMemcpy of `count` elements of type T (NOT bytes) without capture into a CUDA graph.
// The guard switches the thread's capture mode to relaxed for the duration of the call, and the copy runs on a
// private non-blocking stream that is synchronized before returning and destroyed by RAII.
// NOTE(review): `stream` is passed where a cudaStream_t is expected — presumably CudaStreamWithFlags has an implicit
// conversion operator; its full definition is not visible here.
template <class T>
void memcpyCuda(T* dst, const T* src, size_t count, cudaMemcpyKind kind = cudaMemcpyDefault) {
AvoidCudaGraphCaptureGuard cgcGuard;
CudaStreamWithFlags stream(cudaStreamNonBlocking);
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
}
} // namespace mscclpp

View File

@@ -11,6 +11,7 @@
#include <vector>
#include "api.h"
#include "checks.h"
#include "checks_internal.hpp"
#include "socket.h"
#include "utils.h"

View File

@@ -6,12 +6,16 @@
#include "socket.h"
#include <errno.h>
#include <ifaddrs.h>
#include <net/if.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "checks.h"
#include "config.h"
#include "debug.h"
#include "utils.h"
static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset,

View File

@@ -5,7 +5,7 @@
namespace mscclpp {
MSCCLPP_API_CPP DeviceEpoch::DeviceEpoch(Communicator& communicator, std::shared_ptr<Connection> connection)
: BaseEpoch(connection, makeUniqueCuda<EpochIds>(), makeUniqueCuda<uint64_t>()) {
: BaseEpoch(connection, allocUniqueCuda<EpochIds>(), allocUniqueCuda<uint64_t>()) {
setup(communicator);
}

View File

@@ -11,7 +11,7 @@
namespace mscclpp {
struct HostProxyFifo::Impl {
UniqueCudaHostPtr<ProxyTrigger> triggers;
UniqueCudaHostPtr<ProxyTrigger[]> triggers;
UniqueCudaPtr<uint64_t> head;
UniqueCudaPtr<uint64_t> tailReplica;
@@ -28,9 +28,9 @@ struct HostProxyFifo::Impl {
CudaStreamWithFlags stream;
Impl()
: triggers(makeUniqueCudaHost<ProxyTrigger>(MSCCLPP_PROXY_FIFO_SIZE)),
head(makeUniqueCuda<uint64_t>(1)),
tailReplica(makeUniqueCuda<uint64_t>(1)),
: triggers(makeUniqueCudaHost<ProxyTrigger[]>(MSCCLPP_PROXY_FIFO_SIZE)),
head(allocUniqueCuda<uint64_t>()),
tailReplica(allocUniqueCuda<uint64_t>()),
hostTail(0),
stream(cudaStreamNonBlocking) {}
};

View File

@@ -1,182 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef MSCCLPP_ALLOC_H_
#define MSCCLPP_ALLOC_H_
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>
#include "align.h"
#include "checks.h"
#include "mscclpp.h"
#include "utils.h"
// Allocates `nelem` zero-initialized objects of type T in page-locked host memory via cudaHostAlloc
// (mapped + write-combined flags) and stores the pointer in *ptr. On failure *ptr stays nullptr and the
// CHECK macros propagate the error. Graph capture is temporarily switched to relaxed mode around the CUDA
// calls so the allocation is not recorded into an ongoing CUDA graph capture.
// Invoked through the mscclppCudaHostCalloc(...) macro below, which supplies __FILE__/__LINE__.
template <typename T>
mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
mscclppResult_t result = mscclppSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
// Exchange to relaxed capture mode; the matching exchange at `finish` restores the previous mode.
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem * sizeof(T), cudaHostAllocMapped | cudaHostAllocWriteCombined), result,
              finish);
// cudaHostAlloc does not zero the memory, so zero it explicitly.
memset(*ptr, 0, nelem * sizeof(T));
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem * sizeof(T));
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
#define mscclppCudaHostCalloc(...) mscclppCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
// Frees memory previously obtained from mscclppCudaHostCalloc with cudaFreeHost.
// Returns mscclppSuccess on success; on a CUDA error, CUDACHECK (defined in checks.h) is
// presumably responsible for returning the error code early — confirm against its definition.
inline mscclppResult_t mscclppCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
return mscclppSuccess;
}
// Zero-initialized host allocation of `nelem` objects of type T, with debug logging.
// On success stores the pointer in *ptr and returns mscclppSuccess; on allocation failure
// logs a warning and returns mscclppSystemError, leaving *ptr untouched.
// Invoked through the mscclppCalloc(...) macro below, which supplies __FILE__/__LINE__.
template <typename T>
mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
  // calloc both zeroes the memory and checks the nelem * sizeof(T) multiplication for
  // overflow, unlike the previous malloc + memset pair.
  void* p = calloc(nelem, sizeof(T));
  if (p == NULL) {
    WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
    return mscclppSystemError;
  }
  INFO(MSCCLPP_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), p);
  *ptr = (T*)p;
  return mscclppSuccess;
}
#define mscclppCalloc(...) mscclppCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
// Grows an array of T from `oldNelem` to `nelem` elements: copies the old contents into a new
// buffer, frees the old one, and zero-fills the newly added tail. Shrinking is rejected with
// mscclppInternalError and an equal-size call returns immediately. On malloc failure *ptr is
// left untouched (still pointing at the old, valid buffer) and mscclppSystemError is returned.
template <typename T>
mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, size_t nelem) {
if (nelem < oldNelem) return mscclppInternalError;
if (nelem == oldNelem) return mscclppSuccess;
T* oldp = *ptr;
T* p = (T*)malloc(nelem * sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
return mscclppSystemError;
}
// Copy old contents, release the old buffer, then zero only the grown tail.
memcpy(p, oldp, oldNelem * sizeof(T));
free(oldp);
memset(p + oldNelem, 0, (nelem - oldNelem) * sizeof(T));
*ptr = (T*)p;
INFO(MSCCLPP_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem * sizeof(T), nelem * sizeof(T),
*ptr);
return mscclppSuccess;
}
// Allocates `nelem` objects of type T on the device with cudaMalloc. The memory is NOT zeroed
// (see mscclppCudaCallocDebug for the zeroing variant). Graph capture is temporarily switched
// to relaxed mode around the call so the allocation is not captured into a CUDA graph.
// Invoked through the mscclppCudaMalloc(...) macro below, which supplies __FILE__/__LINE__.
template <typename T>
mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
mscclppResult_t result = mscclppSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
finish:
// Restore the previous capture mode on both the success and the error path.
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem * sizeof(T));
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
#define mscclppCudaMalloc(...) mscclppCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
// Allocates `nelem` objects of type T on the device and zeroes them with cudaMemsetAsync on a
// private non-blocking side stream, so the memset cannot interfere with an ongoing CUDA graph
// capture. Graph capture is additionally disabled around the whole call by switching the
// thread's capture mode to relaxed. On failure *ptr stays nullptr and the CHECK macros
// propagate the error. Invoked through the mscclppCudaCalloc(...) macro below, which supplies
// __FILE__/__LINE__ for the debug log.
template <typename T>
mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
  mscclppResult_t result = mscclppSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  // Need a side stream so as not to interfere with graph capture.
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
  CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
finish:
  // Destroy the side stream on every path; previously an error in the malloc/memset/sync
  // jumped past cudaStreamDestroy and leaked the stream.
  CUDACHECK(cudaStreamDestroy(stream));
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T));
  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
  return result;
}
#define mscclppCudaCalloc(...) mscclppCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
// Allocates `nelem` objects of type T on the device and enqueues a zeroing cudaMemsetAsync on the
// caller-supplied `stream`; the memory is therefore only guaranteed to be zero after that stream
// has been synchronized by the caller. Graph capture is temporarily switched to relaxed mode
// around the CUDA calls. Invoked through the mscclppCudaCallocAsync(...) macro below, which
// supplies __FILE__/__LINE__ for the debug log.
template <typename T>
mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char* filefunc,
int line) {
mscclppResult_t result = mscclppSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
finish:
// Restore the previous capture mode on both the success and the error path.
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem * sizeof(T));
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
#define mscclppCudaCallocAsync(...) mscclppCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)
// Synchronously copies `nelem` objects of type T (direction inferred from the pointers via
// cudaMemcpyDefault in mscclppCudaMemcpyAsync). The copy runs on a private non-blocking side
// stream that is synchronized before returning, and graph capture is switched to relaxed mode
// for the duration so the copy is not captured into a CUDA graph.
// NOTE(review): this calls mscclppCudaMemcpyAsync, which is declared later in this header; for
// element types whose pointers have no associated namespaces, ADL cannot find it and conforming
// compilers may reject the unqualified call — consider declaring it above this function.
template <typename T>
mscclppResult_t mscclppCudaMemcpy(T* dst, T* src, size_t nelem) {
  mscclppResult_t result = mscclppSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  // Need a side stream so as not to interfere with graph capture.
  cudaStream_t stream = nullptr;
  CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish);
  MSCCLPPCHECKGOTO(mscclppCudaMemcpyAsync(dst, src, nelem, stream), result, finish);
  CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
finish:
  // Destroy the side stream on every path; previously an error in the copy or the sync jumped
  // past cudaStreamDestroy and leaked the stream. Guard against a failed stream creation.
  if (stream != nullptr) CUDACHECK(cudaStreamDestroy(stream));
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  return result;
}
// Enqueues an asynchronous copy of `nelem` objects of type T on `stream`, with the direction
// inferred from the pointer values (cudaMemcpyDefault). Graph capture is temporarily switched
// to relaxed mode around the call so the copy itself is not captured into an ongoing CUDA graph.
// The caller is responsible for synchronizing `stream` before reading the destination.
template <typename T>
mscclppResult_t mscclppCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) {
mscclppResult_t result = mscclppSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem * sizeof(T), cudaMemcpyDefault, stream), result, finish);
finish:
// Restore the previous capture mode on both the success and the error path.
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
return result;
}
// Frees device memory previously obtained from the mscclppCudaMalloc/mscclppCudaCalloc helpers.
// Graph capture is temporarily switched to relaxed mode around cudaFree so the free is not
// captured into an ongoing CUDA graph; the previous mode is restored before returning.
template <typename T>
mscclppResult_t mscclppCudaFree(T* ptr) {
mscclppResult_t result = mscclppSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaFree(ptr), result, finish);
finish:
// Restore the previous capture mode on both the success and the error path.
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
return result;
}
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
// allocated on separate pages as those pages will be marked DONTFORK
// and if they are shared, that could cause a crash in a child process.
// Rounds the request up to whole pages, page-aligns it with posix_memalign, and zeroes the
// requested `size` bytes. Returns mscclppSystemError if posix_memalign fails.
// Invoked through the mscclppIbMalloc(...) macro below, which supplies __FILE__/__LINE__.
inline mscclppResult_t mscclppIbMallocDebug(void** ptr, size_t size, const char* filefunc, int line) {
  size_t page_size = sysconf(_SC_PAGESIZE);
  void* p;
  // size_t (was int): an int truncated/overflowed the rounded size for allocations >= 2 GiB.
  size_t size_aligned = ROUNDUP(size, page_size);
  int ret = posix_memalign(&p, page_size, size_aligned);
  if (ret != 0) return mscclppSystemError;
  memset(p, 0, size);
  *ptr = p;
  INFO(MSCCLPP_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
  return mscclppSuccess;
}
#define mscclppIbMalloc(...) mscclppIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
#endif

View File

@@ -8,11 +8,11 @@
#define MSCCLPP_UTILS_H_
#include <stdint.h>
#include <stdio.h>
#include <chrono>
#include "alloc.h"
// #include "mscclpp.h"
#include "mscclpp.h"
// int mscclppCudaCompCap();

View File

@@ -6,49 +6,43 @@
#include <chrono>
#include <fstream>
#include "alloc.h"
uint64_t NpKit::rank_ = 0;
NpKitEvent** NpKit::gpu_event_buffers_ = nullptr;
NpKitEvent** NpKit::cpu_event_buffers_ = nullptr;
std::vector<mscclpp::UniqueCudaPtr<NpKitEvent>> NpKit::gpu_event_buffers_;
std::vector<std::unique_ptr<NpKitEvent[]>> NpKit::cpu_event_buffers_;
NpKitEventCollectContext* NpKit::gpu_collect_contexts_ = nullptr;
NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr;
mscclpp::UniqueCudaPtr<NpKitEventCollectContext> NpKit::gpu_collect_contexts_;
std::unique_ptr<NpKitEventCollectContext[]> NpKit::cpu_collect_contexts_;
uint64_t NpKit::cpu_base_system_timestamp_ = 0;
uint64_t NpKit::cpu_base_steady_timestamp_ = 0;
mscclppResult_t NpKit::Init(int rank) {
void NpKit::Init(int rank) {
uint64_t i = 0;
NpKitEventCollectContext ctx;
ctx.event_buffer_head = 0;
rank_ = rank;
// Init event data structures
MSCCLPPCHECK(mscclppCalloc(&gpu_event_buffers_, kNumGpuEventBuffers));
MSCCLPPCHECK(mscclppCudaCalloc(&gpu_collect_contexts_, kNumGpuEventBuffers));
gpu_collect_contexts_ = mscclpp::allocUniqueCuda<NpKitEventCollectContext>(kNumGpuEventBuffers);
for (i = 0; i < kNumGpuEventBuffers; i++) {
MSCCLPPCHECK(mscclppCudaCalloc(gpu_event_buffers_ + i, kMaxNumGpuEventsPerBuffer));
ctx.event_buffer = gpu_event_buffers_[i];
MSCCLPPCHECK(mscclppCudaMemcpy(gpu_collect_contexts_ + i, &ctx, 1));
gpu_event_buffers_.emplace_back(mscclpp::allocUniqueCuda<NpKitEvent>(kMaxNumGpuEventsPerBuffer));
ctx.event_buffer = gpu_event_buffers_[i].get();
mscclpp::memcpyCuda(gpu_collect_contexts_.get() + i, &ctx, 1);
}
MSCCLPPCHECK(mscclppCalloc(&cpu_event_buffers_, kNumCpuEventBuffers));
MSCCLPPCHECK(mscclppCalloc(&cpu_collect_contexts_, kNumCpuEventBuffers));
cpu_collect_contexts_ = std::make_unique<NpKitEventCollectContext[]>(kNumCpuEventBuffers);
for (i = 0; i < kNumCpuEventBuffers; i++) {
MSCCLPPCHECK(mscclppCalloc(cpu_event_buffers_ + i, kMaxNumCpuEventsPerBuffer));
ctx.event_buffer = cpu_event_buffers_[i];
cpu_event_buffers_.emplace_back(std::make_unique<NpKitEvent[]>(kMaxNumCpuEventsPerBuffer));
ctx.event_buffer = cpu_event_buffers_[i].get();
cpu_collect_contexts_[i] = ctx;
}
// Init timestamp
cpu_base_system_timestamp_ = std::chrono::system_clock::now().time_since_epoch().count();
cpu_base_steady_timestamp_ = std::chrono::steady_clock::now().time_since_epoch().count();
return mscclppSuccess;
}
mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
void NpKit::Dump(const std::string& dump_dir) {
uint64_t i = 0;
std::string dump_file_path;
@@ -60,7 +54,7 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
dump_file_path += "_channel_";
dump_file_path += std::to_string(i);
auto cpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
cpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[i]),
cpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[i].get()),
cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent));
cpu_trace_file.close();
}
@@ -89,10 +83,10 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
dump_file_path += std::to_string(rank_);
dump_file_path += "_buf_";
dump_file_path += std::to_string(i);
MSCCLPPCHECK(mscclppCudaMemcpy(cpu_event_buffers_[0], gpu_event_buffers_[i], kMaxNumGpuEventsPerBuffer));
MSCCLPPCHECK(mscclppCudaMemcpy(cpu_collect_contexts_, gpu_collect_contexts_ + i, 1));
mscclpp::memcpyCuda(cpu_event_buffers_[0].get(), gpu_event_buffers_[i].get(), kMaxNumGpuEventsPerBuffer);
mscclpp::memcpyCuda(cpu_collect_contexts_.get(), gpu_collect_contexts_.get() + i, 1);
auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
gpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[0]),
gpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[0].get()),
cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
gpu_trace_file.close();
}
@@ -103,37 +97,27 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
dump_file_path += std::to_string(rank_);
cudaDeviceProp dev_prop;
int dev;
CUDACHECK(cudaGetDevice(&dev));
CUDACHECK(cudaGetDeviceProperties(&dev_prop, dev));
MSCCLPP_CUDATHROW(cudaGetDevice(&dev));
MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev));
std::string clock_rate_str = std::to_string(dev_prop.clockRate);
auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
gpu_clock_rate_file.close();
return mscclppSuccess;
}
mscclppResult_t NpKit::Shutdown() {
void NpKit::Shutdown() {
uint64_t i = 0;
// Free CPU event data structures
for (i = 0; i < kNumCpuEventBuffers; i++) {
free(cpu_event_buffers_[i]);
}
free(cpu_event_buffers_);
free(cpu_collect_contexts_);
cpu_event_buffers_.clear();
cpu_collect_contexts_.reset();
// Free GPU event data structures
for (i = 0; i < kNumGpuEventBuffers; i++) {
CUDACHECK(cudaFree(gpu_event_buffers_[i]));
}
free(gpu_event_buffers_);
CUDACHECK(cudaFree(gpu_collect_contexts_));
return mscclppSuccess;
gpu_event_buffers_.clear();
gpu_collect_contexts_.reset();
}
NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() { return gpu_collect_contexts_; }
NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() { return gpu_collect_contexts_.get(); }
void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) {
uint64_t event_buffer_head = cpu_collect_contexts_[channel_id].event_buffer_head;

View File

@@ -1,9 +1,10 @@
#ifndef NPKIT_H_
#define NPKIT_H_
#include <mscclpp/cuda_utils.hpp>
#include <string>
#include <vector>
#include "mscclpp.h"
#include "npkit_event.h"
#include "npkit_struct.h"
@@ -13,11 +14,11 @@ class NpKit {
static const uint64_t kNumCpuEventBuffers = 32;
static mscclppResult_t Init(int rank);
static void Init(int rank);
static mscclppResult_t Dump(const std::string& dump_dir);
static void Dump(const std::string& dump_dir);
static mscclppResult_t Shutdown();
static void Shutdown();
static NpKitEventCollectContext* GetGpuEventCollectContexts();
@@ -47,11 +48,11 @@ class NpKit {
// 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU
static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;
static NpKitEvent** gpu_event_buffers_;
static NpKitEvent** cpu_event_buffers_;
static std::vector<mscclpp::UniqueCudaPtr<NpKitEvent>> gpu_event_buffers_;
static std::vector<std::unique_ptr<NpKitEvent[]>> cpu_event_buffers_;
static NpKitEventCollectContext* gpu_collect_contexts_;
static NpKitEventCollectContext* cpu_collect_contexts_;
static mscclpp::UniqueCudaPtr<NpKitEventCollectContext> gpu_collect_contexts_;
static std::unique_ptr<NpKitEventCollectContext[]> cpu_collect_contexts_;
static uint64_t cpu_base_system_timestamp_;
static uint64_t cpu_base_steady_timestamp_;

View File

@@ -6,6 +6,7 @@
#include "api.h"
#include "checks_internal.hpp"
#include "debug.h"
#include "utils.h"
namespace mscclpp {

View File

@@ -6,12 +6,16 @@
#include "utils.h"
#include <cuda_runtime.h>
#include <numa.h>
#include <stdlib.h>
#include <unistd.h>
#include <memory>
#include <string>
#include "checks.h"
// Get current Compute Capability
// int mscclppCudaCompCap() {
// int cudaDev;

View File

@@ -2,6 +2,7 @@
#include <mscclpp/fifo.hpp>
#include <mscclpp/proxy.hpp>
#include <mscclpp/epoch.hpp>
#include <mscclpp/checks.hpp>
#include <utils.h>
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
@@ -392,9 +393,9 @@ int main(int argc, char* argv[])
printf("Stopping MSCCL++ proxy threads\n");
proxyService.stop();
CUDACHECK(cudaFree(data_d));
CUDACHECK(cudaFree(deviceHandles1));
CUDACHECK(cudaFree(deviceHandles2));
MSCCLPP_CUDATHROW(cudaFree(data_d));
MSCCLPP_CUDATHROW(cudaFree(deviceHandles1));
MSCCLPP_CUDATHROW(cudaFree(deviceHandles2));
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
MPI_Finalize();

View File

@@ -34,7 +34,7 @@ int main(int argc, const char* argv[])
CUDACHECK(cudaSetDevice(cudaDevId));
int nelem = 1;
auto data = mscclpp::makeUniqueCuda<int>(nelem);
auto data = mscclpp::allocUniqueCuda<int>(nelem);
std::shared_ptr<mscclpp::Bootstrap> bootstrap(new mscclpp::Bootstrap(isSend, 2));
bootstrap->initialize(ipPortPair);

View File

@@ -200,7 +200,7 @@ class AllGatherTestEngine : public BaseTestEngine {
};
void AllGatherTestEngine::allocateBuffer() {
sendBuff_ = mscclpp::makeSharedCuda<int>(args_.maxBytes / sizeof(int));
sendBuff_ = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
expectedBuff_ = std::shared_ptr<int[]>(new int[args_.maxBytes / sizeof(int)]);
}

View File

@@ -119,8 +119,8 @@ class SendRecvTestEngine : public BaseTestEngine {
};
void SendRecvTestEngine::allocateBuffer() {
std::shared_ptr<int> sendBuff = mscclpp::makeSharedCuda<int>(args_.maxBytes / sizeof(int));
std::shared_ptr<int> recvBuff = mscclpp::makeSharedCuda<int>(args_.maxBytes / sizeof(int));
std::shared_ptr<int> sendBuff = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
std::shared_ptr<int> recvBuff = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
devicePtrs_.push_back(sendBuff);
devicePtrs_.push_back(recvBuff);

View File

@@ -2,11 +2,11 @@
#include <mscclpp/cuda_utils.hpp>
TEST(CudaMemoryTest, Shared) {
auto p1 = mscclpp::makeSharedCuda<uint32_t>();
auto p2 = mscclpp::makeSharedCuda<int64_t>(5);
auto p1 = mscclpp::allocSharedCuda<uint32_t>();
auto p2 = mscclpp::allocSharedCuda<int64_t>(5);
}
TEST(CudaMemoryTest, Unique) {
auto p1 = mscclpp::makeUniqueCuda<uint32_t>();
auto p2 = mscclpp::makeUniqueCuda<int64_t>(5);
auto p1 = mscclpp::allocUniqueCuda<uint32_t>();
auto p2 = mscclpp::allocUniqueCuda<int64_t>(5);
}