mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-06-08 15:30:41 +00:00
Add GpuBuffer class (#423)
* Renamed and moved mem alloc functions into the `mscclpp::detail::` namespace (now `mscclpp::detail::gpuCalloc*<T>()`) * Deprecated constructor-calling mem alloc functions (`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`) * Added a new `mscclpp::GpuBuffer<T>()` class that should be used in general for allocating communication buffers * Added a new `mscclpp.utils.GpuBuffer` Python class that inherits `cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc` * Renamed `mscclpp::memcpyCuda*<T>()` functions into `mscclpp::gpuMemcpy*<T>()` for name consistency * A few fixes in NVLS memory allocation * Tackled minor compiler warnings
This commit is contained in:
@@ -35,7 +35,7 @@ jobs:
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
template <bool IsOutOfPlace>
|
||||
__global__ void __launch_bounds__(1024, 1)
|
||||
broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
|
||||
size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root,
|
||||
[[maybe_unused]] size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root,
|
||||
size_t nRanksPerNode, size_t nelemsPerGPU) {
|
||||
const size_t nThread = blockDim.x * gridDim.x;
|
||||
const size_t nPeer = nRanksPerNode - 1;
|
||||
|
||||
@@ -176,9 +176,9 @@ static std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> setupSmChannel
|
||||
std::transform(smChannels.begin(), smChannels.end(), std::back_inserter(smChannelDeviceHandles),
|
||||
[](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
|
||||
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> ptr =
|
||||
mscclpp::allocSharedCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(smChannelDeviceHandles.size());
|
||||
mscclpp::memcpyCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(ptr.get(), smChannelDeviceHandles.data(),
|
||||
smChannelDeviceHandles.size(), cudaMemcpyHostToDevice);
|
||||
mscclpp::detail::gpuCallocShared<mscclpp::DeviceHandle<mscclpp::SmChannel>>(smChannelDeviceHandles.size());
|
||||
mscclpp::gpuMemcpy<mscclpp::DeviceHandle<mscclpp::SmChannel>>(ptr.get(), smChannelDeviceHandles.data(),
|
||||
smChannelDeviceHandles.size(), cudaMemcpyHostToDevice);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
@@ -360,7 +360,7 @@ static void ncclCommInitRankFallbackSingleNode(ncclComm* commPtr, std::shared_pt
|
||||
commPtr->smSemaphores = std::move(smSemaphores);
|
||||
commPtr->buffFlag = 0;
|
||||
commPtr->numScratchBuff = 2;
|
||||
commPtr->scratchBuff = mscclpp::allocExtSharedCuda<char>(SCRATCH_SIZE);
|
||||
commPtr->scratchBuff = mscclpp::GpuBuffer(SCRATCH_SIZE).memory();
|
||||
commPtr->remoteScratchRegMemories =
|
||||
setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), SCRATCH_SIZE, mscclpp::Transport::CudaIpc);
|
||||
}
|
||||
@@ -624,7 +624,6 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
|
||||
}
|
||||
|
||||
int rank = comm->comm->bootstrap()->getRank();
|
||||
int nRank = comm->comm->bootstrap()->getNranks();
|
||||
|
||||
std::vector<executionPlanInstance>& plans = comm->executionPlans["broadcast"];
|
||||
std::shared_ptr<mscclpp::ExecutionPlan> plan;
|
||||
@@ -817,18 +816,13 @@ NCCL_API ncclResult_t ncclCommDeregister(const ncclComm_t, void*) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclMemAlloc(void** ptr, size_t size) {
|
||||
// Allocate memory using mscclpp::allocSharedPhysicalCuda
|
||||
if (ptr == nullptr || size == 0) {
|
||||
WARN("ptr is nullptr or size is 0");
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
std::shared_ptr<char> sharedPtr;
|
||||
try {
|
||||
if (mscclpp::isNvlsSupported()) {
|
||||
sharedPtr = mscclpp::allocSharedPhysicalCuda<char>(size);
|
||||
} else {
|
||||
sharedPtr = mscclpp::allocExtSharedCuda<char>(size);
|
||||
}
|
||||
sharedPtr = mscclpp::GpuBuffer(size).memory();
|
||||
if (sharedPtr == nullptr) {
|
||||
INFO(MSCCLPP_ALLOC, "Failed to allocate memory");
|
||||
return ncclSystemError;
|
||||
|
||||
@@ -14,6 +14,7 @@ from mscclpp import (
|
||||
ProxyService,
|
||||
Transport,
|
||||
)
|
||||
from mscclpp.utils import GpuBuffer
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
|
||||
def create_connection(group: mscclpp_comm.CommGroup, transport: str):
|
||||
@@ -32,7 +33,7 @@ if __name__ == "__main__":
|
||||
mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
|
||||
connections = create_connection(mscclpp_group, "NVLink")
|
||||
nelems = 1024
|
||||
memory = cp.zeros(nelem, dtype=cp.int32)
|
||||
memory = GpuBuffer(nelem, dtype=cp.int32)
|
||||
proxy_service = ProxyService()
|
||||
simple_channels = group.make_proxy_channels(proxy_service, memory, connections)
|
||||
proxy_service.start_proxy()
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
#ifndef MSCCLPP_GPU_UTILS_HPP_
|
||||
#define MSCCLPP_GPU_UTILS_HPP_
|
||||
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
|
||||
#include "errors.hpp"
|
||||
@@ -35,19 +34,6 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
/// set memory access permission to read-write
|
||||
/// @param base Base memory pointer.
|
||||
/// @param size Size of the memory.
|
||||
inline void setReadWriteMemoryAccess(void* base, size_t size) {
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
int deviceId;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
accessDesc.location.id = deviceId;
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, size, &accessDesc, 1));
|
||||
}
|
||||
|
||||
/// A RAII guard that will cudaThreadExchangeStreamCaptureMode to cudaStreamCaptureModeRelaxed on construction and
|
||||
/// restore the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture.
|
||||
struct AvoidCudaGraphCaptureGuard {
|
||||
@@ -64,101 +50,46 @@ struct CudaStreamWithFlags {
|
||||
cudaStream_t stream_;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct CudaDeleter;
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// A wrapper of cudaMalloc that sets the allocated memory to zero.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param nelem Number of elements to allocate.
|
||||
/// @return A pointer to the allocated memory.
|
||||
template <class T>
|
||||
T* cudaCalloc(size_t nelem) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
T* ptr;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&ptr, nelem * sizeof(T)));
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, nelem * sizeof(T), stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
return ptr;
|
||||
}
|
||||
void setReadWriteMemoryAccess(void* base, size_t size);
|
||||
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
template <class T>
|
||||
T* cudaPhysicalCalloc(size_t nelems, size_t gran) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
int deviceId = -1;
|
||||
CUdevice currentDevice;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
MSCCLPP_CUTHROW(cuDeviceGet(¤tDevice, deviceId));
|
||||
|
||||
CUmemAllocationProp prop = {};
|
||||
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.requestedHandleTypes =
|
||||
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
|
||||
prop.location.id = currentDevice;
|
||||
|
||||
// allocate physical memory
|
||||
CUmemGenericAllocationHandle memHandle;
|
||||
size_t nbytes = (nelems * sizeof(T) + gran - 1) / gran * gran;
|
||||
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));
|
||||
|
||||
T* devicePtr = nullptr;
|
||||
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, gran, 0U, 0));
|
||||
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0));
|
||||
setReadWriteMemoryAccess(devicePtr, nbytes);
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
|
||||
return devicePtr;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
T* cudaExtCalloc(size_t nelem) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
T* ptr;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
void* gpuCalloc(size_t bytes);
|
||||
void* gpuCallocHost(size_t bytes);
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
MSCCLPP_CUDATHROW(hipExtMallocWithFlags((void**)&ptr, nelem * sizeof(T), hipDeviceMallocUncached));
|
||||
#else
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&ptr, nelem * sizeof(T)));
|
||||
#endif
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, nelem * sizeof(T), stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
return ptr;
|
||||
}
|
||||
void* gpuCallocUncached(size_t bytes);
|
||||
#endif // defined(__HIP_PLATFORM_AMD__)
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
void* gpuCallocPhysical(size_t bytes, size_t gran = 0, size_t align = 0);
|
||||
#endif // CUDA_NVLS_SUPPORTED
|
||||
|
||||
/// A wrapper of cudaHostAlloc that sets the allocated memory to zero.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param nelem Number of elements to allocate.
|
||||
/// @return A pointer to the allocated memory.
|
||||
template <class T>
|
||||
T* cudaHostCalloc(size_t nelem) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
T* ptr;
|
||||
MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, nelem * sizeof(T), cudaHostAllocMapped | cudaHostAllocWriteCombined));
|
||||
memset(ptr, 0, nelem * sizeof(T));
|
||||
return ptr;
|
||||
}
|
||||
void gpuFree(void* ptr);
|
||||
void gpuFreeHost(void* ptr);
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
void gpuFreePhysical(void* ptr);
|
||||
#endif // CUDA_NVLS_SUPPORTED
|
||||
|
||||
void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream,
|
||||
cudaMemcpyKind kind = cudaMemcpyDefault);
|
||||
void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault);
|
||||
|
||||
/// A template function that allocates memory while ensuring that the memory will be freed when the returned object is
|
||||
/// destroyed.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @tparam alloc A function that allocates memory.
|
||||
/// @tparam Deleter A deleter that will be used to free the allocated memory.
|
||||
/// @tparam Memory The type of the returned object.
|
||||
/// @param nelem Number of elements to allocate.
|
||||
/// @tparam Alloc A function type that allocates memory.
|
||||
/// @tparam Args Input types of the @p alloc function variables.
|
||||
/// @param alloc A function that allocates memory.
|
||||
/// @param nelems Number of elements to allocate.
|
||||
/// @param args Extra input variables for the @p alloc function.
|
||||
/// @return An object of type @p Memory that will free the allocated memory when destroyed.
|
||||
///
|
||||
template <class T, T*(alloc)(size_t), class Deleter, class Memory>
|
||||
Memory safeAlloc(size_t nelem) {
|
||||
template <class T, class Deleter, class Memory, typename Alloc, typename... Args>
|
||||
Memory safeAlloc(Alloc alloc, size_t nelems, Args&&... args) {
|
||||
T* ptr = nullptr;
|
||||
try {
|
||||
ptr = alloc(nelem);
|
||||
ptr = reinterpret_cast<T*>(alloc(nelems * sizeof(T), std::forward<Args>(args)...));
|
||||
} catch (...) {
|
||||
if (ptr) {
|
||||
Deleter()(ptr);
|
||||
@@ -168,258 +99,164 @@ Memory safeAlloc(size_t nelem) {
|
||||
return Memory(ptr, Deleter());
|
||||
}
|
||||
|
||||
template <class T, T*(alloc)(size_t, size_t), class Deleter, class Memory>
|
||||
Memory safeAlloc(size_t nelem, size_t gran) {
|
||||
if ((nelem * sizeof(T)) % gran) {
|
||||
throw Error("The request allocation size is not divisible by the required granularity:" +
|
||||
std::to_string(nelem * sizeof(T)) + " vs " + std::to_string(gran),
|
||||
ErrorCode::InvalidUsage);
|
||||
}
|
||||
T* ptr = nullptr;
|
||||
try {
|
||||
ptr = alloc(nelem, gran);
|
||||
} catch (...) {
|
||||
if (ptr) {
|
||||
Deleter()(ptr);
|
||||
}
|
||||
throw;
|
||||
}
|
||||
return Memory(ptr, Deleter());
|
||||
/// A deleter that calls gpuFree for use with std::unique_ptr or std::shared_ptr.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
template <class T = void>
|
||||
struct GpuDeleter {
|
||||
void operator()(void* ptr) { gpuFree(ptr); }
|
||||
};
|
||||
|
||||
/// A deleter that calls gpuFreeHost for use with std::unique_ptr or std::shared_ptr.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
template <class T = void>
|
||||
struct GpuHostDeleter {
|
||||
void operator()(void* ptr) { gpuFreeHost(ptr); }
|
||||
};
|
||||
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
template <class T = void>
|
||||
struct GpuPhysicalDeleter {
|
||||
void operator()(void* ptr) { gpuFreePhysical(ptr); }
|
||||
};
|
||||
#endif // CUDA_NVLS_SUPPORTED
|
||||
|
||||
template <class T>
|
||||
using UniqueGpuPtr = std::unique_ptr<T, detail::GpuDeleter<T>>;
|
||||
|
||||
template <class T>
|
||||
using UniqueGpuHostPtr = std::unique_ptr<T, detail::GpuHostDeleter<T>>;
|
||||
|
||||
template <class T>
|
||||
auto gpuCallocShared(size_t nelems = 1) {
|
||||
return detail::safeAlloc<T, detail::GpuDeleter<T>, std::shared_ptr<T>>(detail::gpuCalloc, nelems);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
auto gpuCallocUnique(size_t nelems = 1) {
|
||||
return detail::safeAlloc<T, detail::GpuDeleter<T>, UniqueGpuPtr<T>>(detail::gpuCalloc, nelems);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
auto gpuCallocHostShared(size_t nelems = 1) {
|
||||
return detail::safeAlloc<T, detail::GpuHostDeleter<T>, std::shared_ptr<T>>(detail::gpuCallocHost, nelems);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
auto gpuCallocHostUnique(size_t nelems = 1) {
|
||||
return detail::safeAlloc<T, detail::GpuHostDeleter<T>, UniqueGpuHostPtr<T>>(detail::gpuCallocHost, nelems);
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
template <class T>
|
||||
auto gpuCallocUncachedShared(size_t nelems = 1) {
|
||||
return detail::safeAlloc<T, detail::GpuDeleter<T>, std::shared_ptr<T>>(detail::gpuCallocUncached, nelems);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
auto gpuCallocUncachedUnique(size_t nelems = 1) {
|
||||
return detail::safeAlloc<T, detail::GpuDeleter<T>, UniqueGpuPtr<T>>(detail::gpuCallocUncached, nelems);
|
||||
}
|
||||
|
||||
#endif // defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
|
||||
template <class T>
|
||||
using UniqueGpuPhysicalPtr = std::unique_ptr<T, detail::GpuPhysicalDeleter<T>>;
|
||||
|
||||
template <class T>
|
||||
auto gpuCallocPhysicalShared(size_t nelems = 1, size_t gran = 0, size_t align = 0) {
|
||||
return detail::safeAlloc<T, detail::GpuPhysicalDeleter<T>, std::shared_ptr<T>>(detail::gpuCallocPhysical, nelems,
|
||||
gran, align);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
auto gpuCallocPhysicalUnique(size_t nelems = 1, size_t gran = 0, size_t align = 0) {
|
||||
return detail::safeAlloc<T, detail::GpuPhysicalDeleter<T>, UniqueGpuPhysicalPtr<T>>(detail::gpuCallocPhysical, nelems,
|
||||
gran, align);
|
||||
}
|
||||
|
||||
size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag);
|
||||
|
||||
#endif // CUDA_NVLS_SUPPORTED
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// A deleter that calls cudaFree for use with std::unique_ptr or std::shared_ptr.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
template <class T>
|
||||
struct CudaDeleter {
|
||||
using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
|
||||
void operator()(TPtrOrArray ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaFree(ptr));
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct CudaPhysicalDeleter {
|
||||
static_assert(!std::is_array_v<T>, "T must not be an array");
|
||||
void operator()(T* ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
CUmemGenericAllocationHandle handle;
|
||||
size_t size = 0;
|
||||
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, ptr));
|
||||
MSCCLPP_CUTHROW(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
|
||||
MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, size));
|
||||
MSCCLPP_CUTHROW(cuMemRelease(handle));
|
||||
MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, size));
|
||||
}
|
||||
};
|
||||
|
||||
/// A deleter that calls cudaFreeHost for use with std::unique_ptr or std::shared_ptr.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
template <class T>
|
||||
struct CudaHostDeleter {
|
||||
using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
|
||||
void operator()(TPtrOrArray ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
|
||||
}
|
||||
};
|
||||
|
||||
/// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @return A std::shared_ptr to the allocated memory.
|
||||
template <class T>
|
||||
std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
|
||||
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
|
||||
template <class T = char>
|
||||
void gpuMemcpyAsync(T* dst, const T* src, size_t nelems, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) {
|
||||
detail::gpuMemcpyAsync(dst, src, nelems * sizeof(T), stream, kind);
|
||||
}
|
||||
|
||||
template <class T = char>
|
||||
void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMemcpyDefault) {
|
||||
detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
|
||||
}
|
||||
|
||||
bool isNvlsSupported();
|
||||
|
||||
/// Allocates a GPU memory space specialized for communication. The memory is zeroed out. Get the device pointer by
|
||||
/// `GpuBuffer::data()`.
|
||||
///
|
||||
/// Use this function for communication buffers, i.e., only when other devices (CPU, GPU, NIC, etc.) may access this
|
||||
/// memory space at the same time with the local device (GPU). Running heavy computation over this memory space
|
||||
/// may perform bad and is not recommended in general.
|
||||
///
|
||||
/// The allocated memory space is managed by the `memory_` object, not by the class instance. Which means,
|
||||
/// the class destructor will NOT free the allocated memory if `memory_` is shared with and alive in other contexts.
|
||||
///
|
||||
/// @tparam T Type of each element in the allocated memory. Default is `char`.
|
||||
///
|
||||
template <class T = char>
|
||||
class GpuBuffer {
|
||||
public:
|
||||
/// Constructs a GpuBuffer with the specified number of elements.
|
||||
/// @param nelems Number of elements to allocate. If it is zero, `data()` will return a null pointer.
|
||||
GpuBuffer(size_t nelems) : nelems_(nelems) {
|
||||
if (nelems == 0) {
|
||||
bytes_ = 0;
|
||||
return;
|
||||
}
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
static inline size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
|
||||
size_t gran = 0;
|
||||
int numDevices = 0;
|
||||
MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices));
|
||||
if (isNvlsSupported()) {
|
||||
size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
|
||||
bytes_ = (nelems * sizeof(T) + gran - 1) / gran * gran / sizeof(T) * sizeof(T);
|
||||
memory_ = detail::gpuCallocPhysicalShared<T>(nelems, gran);
|
||||
return;
|
||||
}
|
||||
#endif // CUDA_NVLS_SUPPORTED
|
||||
|
||||
CUmulticastObjectProp prop = {};
|
||||
prop.size = size;
|
||||
// This is a dummy value, it might affect the granularity in the future
|
||||
prop.numDevices = numDevices;
|
||||
prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
|
||||
prop.flags = 0;
|
||||
MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag));
|
||||
return gran;
|
||||
}
|
||||
#endif
|
||||
|
||||
/// Allocates physical memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @param gran the granularity of the allocation.
|
||||
/// @return A std::shared_ptr to the allocated memory.
|
||||
template <class T>
|
||||
std::shared_ptr<T> allocSharedPhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
if (!isNvlsSupported()) {
|
||||
throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
|
||||
}
|
||||
if (count == 0) {
|
||||
return nullptr;
|
||||
bytes_ = nelems * sizeof(T);
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
memory_ = detail::gpuCallocUncachedShared<T>(nelems);
|
||||
#else // !defined(__HIP_PLATFORM_AMD__)
|
||||
memory_ = detail::gpuCallocShared<T>(nelems);
|
||||
#endif // !defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
if (gran == 0) {
|
||||
gran = getMulticastGranularity(count * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
|
||||
}
|
||||
size_t nelems = ((count * sizeof(T) + gran - 1) / gran * gran) / sizeof(T);
|
||||
return detail::safeAlloc<T, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>, std::shared_ptr<T>>(nelems, gran);
|
||||
#else
|
||||
throw Error("Only support GPU with Fabric support", ErrorCode::InvalidUsage);
|
||||
#endif
|
||||
}
|
||||
/// Returns the number of elements in the allocated memory.
|
||||
/// @return The number of elements.
|
||||
size_t nelems() const { return nelems_; }
|
||||
|
||||
/// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @return A std::shared_ptr to the allocated memory.
|
||||
template <class T>
|
||||
std::shared_ptr<T> allocExtSharedCuda(size_t count = 1) {
|
||||
return detail::safeAlloc<T, detail::cudaExtCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
|
||||
}
|
||||
/// Returns the number of bytes that is actually allocated. This may be larger than `nelems() * sizeof(T)`.
|
||||
/// @return The number of bytes.
|
||||
size_t bytes() const { return bytes_; }
|
||||
|
||||
/// Unique device pointer that will call cudaFree on destruction.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
template <class T>
|
||||
using UniqueCudaPtr = std::unique_ptr<T, CudaDeleter<T>>;
|
||||
/// Returns the shared pointer to the allocated memory.
|
||||
/// If `nelems()` is zero, this function will return an empty shared pointer.
|
||||
/// @return A `std::shared_ptr` to the allocated memory.
|
||||
std::shared_ptr<T> memory() { return memory_; }
|
||||
|
||||
/// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @return A std::unique_ptr to the allocated memory.
|
||||
template <class T>
|
||||
UniqueCudaPtr<T> allocUniqueCuda(size_t count = 1) {
|
||||
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
|
||||
}
|
||||
/// Returns the device pointer to the allocated memory. Equivalent to `memory().get()`.
|
||||
/// If `nelems()` is zero, this function will return a null pointer.
|
||||
/// @return A device pointer to the allocated memory.
|
||||
T* data() { return memory_.get(); }
|
||||
|
||||
/// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @return A std::unique_ptr to the allocated memory.
|
||||
template <class T>
|
||||
UniqueCudaPtr<T> allocExtUniqueCuda(size_t count = 1) {
|
||||
return detail::safeAlloc<T, detail::cudaExtCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
|
||||
}
|
||||
|
||||
/// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::shared_ptr to it.
|
||||
/// @tparam T Type of the object to construct.
|
||||
/// @tparam Args Types of the arguments to pass to the constructor.
|
||||
/// @param args Arguments to pass to the constructor.
|
||||
/// @return A std::shared_ptr to the allocated memory.
|
||||
template <class T, typename... Args>
|
||||
std::shared_ptr<T> makeSharedCudaHost(Args&&... args) {
|
||||
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(1);
|
||||
new (ptr.get()) T(std::forward<Args>(args)...);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
|
||||
/// std::shared_ptr to it.
|
||||
/// @tparam T Type of the object to construct.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @return A std::shared_ptr to the allocated memory.
|
||||
template <class T>
|
||||
std::shared_ptr<T[]> makeSharedCudaHost(size_t count) {
|
||||
using TElem = std::remove_extent_t<T>;
|
||||
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<TElem>, std::shared_ptr<T[]>>(count);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
new (&ptr[i]) TElem();
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// Unique CUDA host pointer that will call cudaFreeHost on destruction.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
template <class T>
|
||||
using UniqueCudaHostPtr = std::unique_ptr<T, CudaHostDeleter<T>>;
|
||||
|
||||
/// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::unique_ptr to it.
|
||||
/// @tparam T Type of the object to construct.
|
||||
/// @tparam Args Types of the arguments to pass to the constructor.
|
||||
/// @param args Arguments to pass to the constructor.
|
||||
/// @return A std::unique_ptr to the allocated memory.
|
||||
template <class T, typename... Args, std::enable_if_t<false == std::is_array_v<T>, bool> = true>
|
||||
UniqueCudaHostPtr<T> makeUniqueCudaHost(Args&&... args) {
|
||||
auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(1);
|
||||
new (ptr.get()) T(std::forward<Args>(args)...);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
|
||||
/// std::unique_ptr to it.
|
||||
/// @tparam T Type of the object to construct.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @return A std::unique_ptr to the allocated memory.
|
||||
template <class T, std::enable_if_t<true == std::is_array_v<T>, bool> = true>
|
||||
UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
|
||||
using TElem = std::remove_extent_t<T>;
|
||||
auto ptr = detail::safeAlloc<TElem, detail::cudaHostCalloc<TElem>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
new (&ptr[i]) TElem();
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// Allocated physical memory on the device and returns a memory handle along with a virtual memory handle for it.
|
||||
/// The memory is zeroed out.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @param gran the granularity of the allocation.
|
||||
/// @return A std::unique_ptr to the allocated memory.
|
||||
template <class T>
|
||||
std::unique_ptr<T> allocUniquePhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
if (!isNvlsSupported()) {
|
||||
throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
|
||||
}
|
||||
if (count == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (gran == 0) {
|
||||
gran = getMulticastGranularity(count * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
|
||||
}
|
||||
return detail::safeAlloc<T, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>,
|
||||
std::unique_ptr<CudaPhysicalDeleter<T>, CudaDeleter<CudaPhysicalDeleter<T>>>>(count, gran);
|
||||
#else
|
||||
throw Error("Only support GPU with Fabric support", ErrorCode::InvalidUsage);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Asynchronous cudaMemcpy without capture into a CUDA graph.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param dst Destination pointer.
|
||||
/// @param src Source pointer.
|
||||
/// @param count Number of elements to copy.
|
||||
/// @param stream CUDA stream to use.
|
||||
/// @param kind Type of cudaMemcpy to perform.
|
||||
template <class T>
|
||||
void memcpyCudaAsync(T* dst, const T* src, size_t count, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
|
||||
}
|
||||
|
||||
/// Synchronous cudaMemcpy without capture into a CUDA graph.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param dst Destination pointer.
|
||||
/// @param src Source pointer.
|
||||
/// @param count Number of elements to copy.
|
||||
/// @param kind Type of cudaMemcpy to perform.
|
||||
template <class T>
|
||||
void memcpyCuda(T* dst, const T* src, size_t count, cudaMemcpyKind kind = cudaMemcpyDefault) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
}
|
||||
private:
|
||||
size_t nelems_;
|
||||
size_t bytes_;
|
||||
std::shared_ptr<T> memory_;
|
||||
};
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
|
||||
@@ -83,18 +83,18 @@ class NpKit {
|
||||
// 64K * 2 (send/recv) * (1024/64) = 2M, 2M * 64 * 16B = 2GB per CPU
|
||||
static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;
|
||||
|
||||
static std::vector<mscclpp::UniqueCudaPtr<NpKitEvent>> gpu_event_buffers_;
|
||||
static std::vector<mscclpp::detail::UniqueGpuPtr<NpKitEvent>> gpu_event_buffers_;
|
||||
static std::vector<std::unique_ptr<NpKitEvent[]>> cpu_event_buffers_;
|
||||
|
||||
static mscclpp::UniqueCudaPtr<NpKitEventCollectContext> gpu_collect_contexts_;
|
||||
static mscclpp::detail::UniqueGpuPtr<NpKitEventCollectContext> gpu_collect_contexts_;
|
||||
static std::unique_ptr<NpKitEventCollectContext[]> cpu_collect_contexts_;
|
||||
|
||||
static uint64_t rank_;
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
static mscclpp::UniqueCudaHostPtr<uint64_t[]> cpu_timestamp_;
|
||||
static mscclpp::detail::UniqueGpuHostPtr<uint64_t[]> cpu_timestamp_;
|
||||
#else
|
||||
static mscclpp::UniqueCudaHostPtr<uint64_t> cpu_timestamp_;
|
||||
static mscclpp::detail::UniqueGpuHostPtr<uint64_t> cpu_timestamp_;
|
||||
#endif
|
||||
static std::unique_ptr<std::thread> cpu_timestamp_update_thread_;
|
||||
static volatile bool cpu_timestamp_update_thread_should_stop_;
|
||||
|
||||
@@ -16,9 +16,6 @@ class NvlsConnection {
|
||||
NvlsConnection() = delete;
|
||||
std::vector<char> serialize();
|
||||
|
||||
// the recommended buffer size for NVLS, returned by cuMulticastGetGranularity
|
||||
static const int DefaultNvlsBufferSize;
|
||||
|
||||
// Everyone needs to synchronize after creating a NVLS connection before adding devices
|
||||
void addDevice();
|
||||
void addDevice(int cudaDeviceId);
|
||||
@@ -39,10 +36,10 @@ class NvlsConnection {
|
||||
friend class NvlsConnection;
|
||||
};
|
||||
|
||||
/// @brief bind the allocated memory via @ref mscclpp::allocSharedPhysicalCuda to the multicast handle. The behavior
|
||||
/// is undefined if the devicePtr is not allocated by @ref mscclpp::allocSharedPhysicalCuda.
|
||||
/// @param devicePtr
|
||||
/// @param size
|
||||
/// @brief bind the memory allocated via @ref mscclpp::GpuBuffer to the multicast handle. The behavior
|
||||
/// is undefined if the devicePtr is not allocated by @ref mscclpp::GpuBuffer.
|
||||
/// @param devicePtr The device pointer returned by `mscclpp::GpuBuffer::data()`.
|
||||
/// @param size The bytes of the memory to bind to the multicast handle.
|
||||
/// @return DeviceMulticastPointer with devicePtr, mcPtr and bufferSize
|
||||
DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size);
|
||||
|
||||
@@ -65,7 +62,7 @@ class Communicator;
|
||||
/// @param config The configuration for the local endpoint.
|
||||
/// @return std::shared_ptr<NvlsConnection> A shared pointer to the NVLS connection.
|
||||
std::shared_ptr<NvlsConnection> connectNvlsCollective(std::shared_ptr<Communicator> comm, std::vector<int> allRanks,
|
||||
size_t bufferSize = NvlsConnection::DefaultNvlsBufferSize);
|
||||
size_t bufferSize);
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ class BaseSemaphore {
|
||||
};
|
||||
|
||||
/// A semaphore for sending signals from the host to the device.
|
||||
class Host2DeviceSemaphore : public BaseSemaphore<CudaDeleter, std::default_delete> {
|
||||
class Host2DeviceSemaphore : public BaseSemaphore<detail::GpuDeleter, std::default_delete> {
|
||||
private:
|
||||
std::shared_ptr<Connection> connection_;
|
||||
|
||||
@@ -117,7 +117,7 @@ class Host2HostSemaphore : public BaseSemaphore<std::default_delete, std::defaul
|
||||
};
|
||||
|
||||
/// A semaphore for sending signals from the local device to a peer device via SM.
|
||||
class SmDevice2DeviceSemaphore : public BaseSemaphore<CudaDeleter, CudaDeleter> {
|
||||
class SmDevice2DeviceSemaphore : public BaseSemaphore<detail::GpuDeleter, detail::GpuDeleter> {
|
||||
public:
|
||||
/// Constructor.
|
||||
/// @param communicator The communicator.
|
||||
|
||||
@@ -37,8 +37,6 @@ struct ScopedTimer : public Timer {
|
||||
|
||||
std::string getHostName(int maxlen, const char delim);
|
||||
|
||||
bool isNvlsSupported();
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // MSCCLPP_UTILS_HPP_
|
||||
|
||||
@@ -24,9 +24,9 @@ from ._mscclpp import (
|
||||
Executor,
|
||||
ExecutionPlan,
|
||||
PacketType,
|
||||
RawGpuBuffer,
|
||||
version,
|
||||
is_nvls_supported,
|
||||
alloc_shared_physical_cuda,
|
||||
npkit,
|
||||
)
|
||||
|
||||
|
||||
@@ -1,30 +1,18 @@
|
||||
#include <nanobind/nanobind.h>
|
||||
#include <nanobind/stl/shared_ptr.h>
|
||||
|
||||
// #include <memory>
|
||||
#include <mscclpp/gpu_data_types.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
|
||||
namespace nb = nanobind;
|
||||
using namespace mscclpp;
|
||||
|
||||
class PyCudaMemory {
|
||||
public:
|
||||
PyCudaMemory(size_t size) : size_(size) { ptr_ = allocSharedPhysicalCuda<char>(size); }
|
||||
|
||||
uintptr_t getPtr() const { return (uintptr_t)(ptr_.get()); }
|
||||
size_t size() const { return size_; }
|
||||
|
||||
private:
|
||||
std::shared_ptr<char> ptr_;
|
||||
size_t size_;
|
||||
};
|
||||
|
||||
void register_gpu_utils(nb::module_& m) {
|
||||
nb::class_<PyCudaMemory>(m, "PyCudaMemory")
|
||||
.def(nb::init<size_t>(), nb::arg("size"))
|
||||
.def("get_ptr", &PyCudaMemory::getPtr, "Get the raw pointer")
|
||||
.def("size", &PyCudaMemory::size, "Get the size of the allocated memory");
|
||||
m.def(
|
||||
"alloc_shared_physical_cuda", [](size_t size) { return std::make_shared<PyCudaMemory>(size); }, nb::arg("size"));
|
||||
m.def("is_nvls_supported", &isNvlsSupported);
|
||||
|
||||
nb::class_<GpuBuffer<char>>(m, "RawGpuBuffer")
|
||||
.def(nb::init<size_t>(), nb::arg("nelems"))
|
||||
.def("nelems", &GpuBuffer<char>::nelems)
|
||||
.def("bytes", &GpuBuffer<char>::bytes)
|
||||
.def("data", [](GpuBuffer<char>& self) { return reinterpret_cast<uintptr_t>(self.data()); });
|
||||
}
|
||||
|
||||
@@ -34,5 +34,5 @@ void register_nvls(nb::module_& m) {
|
||||
.def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity);
|
||||
|
||||
m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("allRanks"),
|
||||
nb::arg("bufferSize") = NvlsConnection::DefaultNvlsBufferSize);
|
||||
nb::arg("bufferSize"));
|
||||
}
|
||||
|
||||
@@ -6,10 +6,11 @@ import os
|
||||
import struct
|
||||
import subprocess
|
||||
import tempfile
|
||||
from typing import Any, Type
|
||||
from typing import Any, Type, Union, Tuple
|
||||
|
||||
import cupy as cp
|
||||
import numpy as np
|
||||
from ._mscclpp import RawGpuBuffer
|
||||
|
||||
try:
|
||||
import torch
|
||||
@@ -36,7 +37,7 @@ class Kernel:
|
||||
nblocks: int,
|
||||
nthreads: int,
|
||||
shared: int,
|
||||
stream: Type[cp.cuda.Stream] or Type[None],
|
||||
stream: Union[cp.cuda.Stream, None],
|
||||
):
|
||||
buffer = (ctypes.c_byte * len(params)).from_buffer_copy(params)
|
||||
buffer_size = ctypes.c_size_t(len(params))
|
||||
@@ -137,6 +138,25 @@ class KernelBuilder:
|
||||
self._tempdir.cleanup()
|
||||
|
||||
|
||||
class GpuBuffer(cp.ndarray):
|
||||
def __new__(
|
||||
cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
|
||||
):
|
||||
# Check if `shape` is valid
|
||||
if isinstance(shape, int):
|
||||
shape = (shape,)
|
||||
try:
|
||||
shape = tuple(shape)
|
||||
except TypeError:
|
||||
raise ValueError("Shape must be a tuple-like or an integer.")
|
||||
if any(s <= 0 for s in shape):
|
||||
raise ValueError("Shape must be positive.")
|
||||
# Create the buffer
|
||||
buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
|
||||
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
|
||||
return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)
|
||||
|
||||
|
||||
def pack(*args):
|
||||
res = b""
|
||||
for arg in list(args):
|
||||
|
||||
@@ -20,5 +20,4 @@ void register_utils(nb::module_& m) {
|
||||
nb::class_<ScopedTimer, Timer>(m, "ScopedTimer").def(nb::init<std::string>(), nb::arg("name"));
|
||||
|
||||
m.def("get_host_name", &getHostName, nb::arg("maxlen"), nb::arg("delim"));
|
||||
m.def("is_nvls_supported", &isNvlsSupported);
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@ from mpi4py import MPI
|
||||
import cupy.cuda.nccl as nccl
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
from mscclpp import ProxyService, is_nvls_supported
|
||||
from mscclpp.utils import GpuBuffer
|
||||
from prettytable import PrettyTable
|
||||
import netifaces as ni
|
||||
import ipaddress
|
||||
@@ -162,8 +163,8 @@ def find_best_config(mscclpp_call, niter):
|
||||
def run_benchmark(
|
||||
mscclpp_group: mscclpp_comm.CommGroup, nccl_op: nccl.NcclCommunicator, table: PrettyTable, niter: int, nelem: int
|
||||
):
|
||||
memory = cp.zeros(nelem, dtype=data_type)
|
||||
memory_out = cp.zeros(nelem, dtype=data_type)
|
||||
memory = GpuBuffer(nelem, dtype=data_type)
|
||||
memory_out = GpuBuffer(nelem, dtype=data_type)
|
||||
cp.cuda.runtime.deviceSynchronize()
|
||||
|
||||
proxy_service = ProxyService()
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import os
|
||||
import cupy as cp
|
||||
import ctypes
|
||||
from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore, alloc_shared_physical_cuda
|
||||
from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
from mscclpp.utils import KernelBuilder, pack
|
||||
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
|
||||
|
||||
|
||||
IB_TRANSPORTS = [
|
||||
@@ -115,7 +115,7 @@ class MscclppAllReduce2:
|
||||
self.connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc)
|
||||
type_str = type_to_str(memory.dtype)
|
||||
|
||||
self.scratch = cp.zeros(self.memory.size * 8, dtype=self.memory.dtype)
|
||||
self.scratch = GpuBuffer(self.memory.size * 8, dtype=self.memory.dtype)
|
||||
# create a sm_channel for each remote neighbor
|
||||
self.sm_channels = self.group.make_sm_channels_with_scratch(self.memory, self.scratch, self.connections)
|
||||
file_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -179,7 +179,7 @@ class MscclppAllReduce3:
|
||||
type_str = type_to_str(memory.dtype)
|
||||
|
||||
self.proxy_service = proxy_service
|
||||
self.scratch = cp.zeros(self.memory.size, dtype=self.memory.dtype)
|
||||
self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype)
|
||||
|
||||
# create a sm_channel for each remote neighbor
|
||||
self.fst_round_proxy_chans = self.group.make_proxy_channels_with_scratch(
|
||||
@@ -259,7 +259,7 @@ class MscclppAllReduce4:
|
||||
type_str = type_to_str(memory.dtype)
|
||||
|
||||
self.proxy_service = proxy_service
|
||||
self.scratch = cp.zeros(self.memory.size, dtype=self.memory.dtype)
|
||||
self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype)
|
||||
same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)}
|
||||
# create a sm_channel for each remote neighbor
|
||||
self.sm_channels = self.group.make_sm_channels(self.memory, same_node_connections)
|
||||
@@ -362,8 +362,8 @@ class MscclppAllReduce5:
|
||||
type_str = type_to_str(memory.dtype)
|
||||
|
||||
self.proxy_service = proxy_service
|
||||
self.scratch = cp.zeros(self.memory.size * 8, dtype=self.memory.dtype)
|
||||
self.put_buff = cp.zeros(self.memory.size * 8 // nranks_per_node, dtype=self.memory.dtype)
|
||||
self.scratch = GpuBuffer(self.memory.size * 8, dtype=self.memory.dtype)
|
||||
self.put_buff = GpuBuffer(self.memory.size * 8 // nranks_per_node, dtype=self.memory.dtype)
|
||||
same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)}
|
||||
across_node_connections = {rank: conn for rank, conn in self.connections.items() if not in_same_node(rank)}
|
||||
# create a sm_channel for each remote neighbor
|
||||
@@ -441,18 +441,10 @@ class MscclppAllReduce6:
|
||||
# create a connection for each remote neighbor
|
||||
self.nvlink_connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc)
|
||||
self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls)
|
||||
min_gran = self.nvls_connection.get_multicast_min_granularity()
|
||||
aligned_buffer_size = int(((buffer_size + min_gran - 1) // min_gran) * min_gran)
|
||||
buffer_raw = alloc_shared_physical_cuda(aligned_buffer_size)
|
||||
self.memory = GpuBuffer(nelem, memory_dtype)
|
||||
self.nvls_mem_handle = self.nvls_connection.bind_allocated_memory(
|
||||
buffer_raw.get_ptr(), aligned_buffer_size
|
||||
) # just using recommended size for now
|
||||
self.memory_ptr = self.nvls_mem_handle.get_device_ptr()
|
||||
|
||||
self.cp_memory_ptr = cp.cuda.MemoryPointer(
|
||||
cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, buffer_raw), 0
|
||||
self.memory.data.ptr, self.memory.data.mem.size
|
||||
)
|
||||
self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr)
|
||||
|
||||
# create a sm_channel for each remote neighbor
|
||||
self.semaphores = group.make_semaphore(self.nvlink_connections, SmDevice2DeviceSemaphore)
|
||||
|
||||
@@ -8,11 +8,9 @@ from mscclpp import (
|
||||
ExecutionPlan,
|
||||
PacketType,
|
||||
npkit,
|
||||
alloc_shared_physical_cuda,
|
||||
is_nvls_supported,
|
||||
)
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
from mscclpp.utils import KernelBuilder, pack
|
||||
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
|
||||
import os
|
||||
import struct
|
||||
|
||||
@@ -129,18 +127,6 @@ def dtype_to_mscclpp_dtype(dtype):
|
||||
raise ValueError(f"Unknown data type: {dtype}")
|
||||
|
||||
|
||||
def allocate_buffer(nelems, dtype):
|
||||
if is_nvls_supported():
|
||||
buffer_raw = alloc_shared_physical_cuda(nelems * cp.dtype(dtype).itemsize)
|
||||
buffer_ptr = cp.cuda.MemoryPointer(
|
||||
cp.cuda.UnownedMemory(buffer_raw.get_ptr(), buffer_raw.size(), buffer_raw), 0
|
||||
)
|
||||
buffer = cp.ndarray(nelems, dtype=dtype, memptr=buffer_ptr)
|
||||
return buffer
|
||||
else:
|
||||
return cp.zeros(nelems, dtype=dtype)
|
||||
|
||||
|
||||
def build_bufs(
|
||||
collective: str,
|
||||
size: int,
|
||||
@@ -160,14 +146,14 @@ def build_bufs(
|
||||
nelems_input = nelems
|
||||
nelems_output = nelems
|
||||
|
||||
result_buf = allocate_buffer(nelems_output, dtype=dtype)
|
||||
result_buf = GpuBuffer(nelems_output, dtype=dtype)
|
||||
if in_place:
|
||||
if "allgather" in collective:
|
||||
input_buf = cp.split(result_buf, num_ranks)[rank]
|
||||
else:
|
||||
input_buf = result_buf
|
||||
else:
|
||||
input_buf = allocate_buffer(nelems_input, dtype=dtype)
|
||||
input_buf = GpuBuffer(nelems_input, dtype=dtype)
|
||||
test_buf = cp.zeros(nelems_output, dtype=dtype)
|
||||
|
||||
return input_buf, result_buf, test_buf
|
||||
|
||||
@@ -27,7 +27,7 @@ from mscclpp import (
|
||||
npkit,
|
||||
)
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
from mscclpp.utils import KernelBuilder, pack
|
||||
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
|
||||
from ._cpp import _ext
|
||||
from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
|
||||
|
||||
@@ -156,12 +156,26 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str):
|
||||
create_group_and_connection(mpi_group, transport)
|
||||
|
||||
|
||||
@parametrize_mpi_groups(1)
|
||||
@pytest.mark.parametrize("nelem", [2**i for i in [0, 10, 15, 20]])
|
||||
@pytest.mark.parametrize("dtype", [cp.float32, cp.float16])
|
||||
def test_gpu_buffer(mpi_group: MpiGroup, nelem: int, dtype: cp.dtype):
|
||||
memory = GpuBuffer(nelem, dtype=dtype)
|
||||
assert memory.shape == (nelem,)
|
||||
assert memory.dtype == dtype
|
||||
assert memory.itemsize == cp.dtype(dtype).itemsize
|
||||
assert memory.nbytes == nelem * cp.dtype(dtype).itemsize
|
||||
assert memory.data.ptr != 0
|
||||
assert memory.data.mem.ptr != 0
|
||||
assert memory.data.mem.size >= nelem * cp.dtype(dtype).itemsize
|
||||
|
||||
|
||||
@parametrize_mpi_groups(2, 4, 8, 16)
|
||||
@pytest.mark.parametrize("transport", ["IB", "NVLink"])
|
||||
@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
|
||||
def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int):
|
||||
group, connections = create_group_and_connection(mpi_group, transport)
|
||||
memory = cp.zeros(nelem, dtype=cp.int32)
|
||||
memory = GpuBuffer(nelem, dtype=cp.int32)
|
||||
nelemPerRank = nelem // group.nranks
|
||||
sizePerRank = nelemPerRank * memory.itemsize
|
||||
memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
|
||||
@@ -436,13 +450,12 @@ def test_d2d_semaphores(mpi_group: MpiGroup):
|
||||
def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool):
|
||||
group, connections = create_group_and_connection(mpi_group, "NVLink")
|
||||
|
||||
memory = cp.zeros(nelem, dtype=cp.int32)
|
||||
memory = GpuBuffer(nelem, dtype=cp.int32)
|
||||
if use_packet:
|
||||
scratch = cp.zeros(nelem * 2, dtype=cp.int32)
|
||||
scratch = GpuBuffer(nelem * 2, dtype=cp.int32)
|
||||
else:
|
||||
scratch = None
|
||||
nelemPerRank = nelem // group.nranks
|
||||
nelemPerRank * memory.itemsize
|
||||
memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
|
||||
memory_expected = cp.zeros_like(memory)
|
||||
for rank in range(group.nranks):
|
||||
@@ -484,7 +497,7 @@ def test_fifo(
|
||||
def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
|
||||
group, connections = create_group_and_connection(mpi_group, transport)
|
||||
|
||||
memory = cp.zeros(nelem, dtype=cp.int32)
|
||||
memory = GpuBuffer(nelem, dtype=cp.int32)
|
||||
nelemPerRank = nelem // group.nranks
|
||||
nelemPerRank * memory.itemsize
|
||||
memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
|
||||
@@ -534,11 +547,11 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
|
||||
def test_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool):
|
||||
group, connections = create_group_and_connection(mpi_group, transport)
|
||||
|
||||
memory = cp.zeros(nelem, dtype=cp.int32)
|
||||
memory = GpuBuffer(nelem, dtype=cp.int32)
|
||||
if use_packet:
|
||||
scratch = cp.zeros(nelem * 2, dtype=cp.int32)
|
||||
scratch = GpuBuffer(nelem * 2, dtype=cp.int32)
|
||||
else:
|
||||
scratch = cp.zeros(1, dtype=cp.int32) # just so that we can pass a valid ptr
|
||||
scratch = GpuBuffer(1, dtype=cp.int32) # just so that we can pass a valid ptr
|
||||
nelemPerRank = nelem // group.nranks
|
||||
nelemPerRank * memory.itemsize
|
||||
memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
|
||||
|
||||
@@ -301,8 +301,8 @@ void EthernetConnection::write(RegisteredMemory dst, uint64_t dstOffset, Registe
|
||||
uint64_t dataSize =
|
||||
std::min(sendBufferSize_ - headerSize / sizeof(char), (size - sentDataSize) / sizeof(char)) * sizeof(char);
|
||||
uint64_t messageSize = dataSize + headerSize;
|
||||
mscclpp::memcpyCuda<char>(sendBuffer_.data() + headerSize / sizeof(char),
|
||||
(char*)srcPtr + (sentDataSize / sizeof(char)), dataSize, cudaMemcpyDeviceToHost);
|
||||
mscclpp::gpuMemcpy(sendBuffer_.data() + headerSize / sizeof(char), srcPtr + (sentDataSize / sizeof(char)), dataSize,
|
||||
cudaMemcpyDeviceToHost);
|
||||
sendSocket_->send(sendBuffer_.data(), messageSize);
|
||||
sentDataSize += messageSize;
|
||||
headerSize = 0;
|
||||
@@ -402,8 +402,7 @@ void EthernetConnection::recvMessages() {
|
||||
received &= !closed;
|
||||
|
||||
if (received)
|
||||
mscclpp::memcpyCuda<char>((char*)ptr + (recvSize / sizeof(char)), recvBuffer_.data(), messageSize,
|
||||
cudaMemcpyHostToDevice);
|
||||
mscclpp::gpuMemcpy(ptr + (recvSize / sizeof(char)), recvBuffer_.data(), messageSize, cudaMemcpyHostToDevice);
|
||||
recvSize += messageSize;
|
||||
}
|
||||
|
||||
|
||||
@@ -155,10 +155,10 @@ struct Executor::Impl {
|
||||
plan.impl_->lightLoadExecutionPlan(inputMessageSize, outputMessageSize, constSrcOffset, constDstOffset);
|
||||
this->setupDeviceExecutionPlan(this->contexts[key], devicePlanKey, rank, plan);
|
||||
this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey] =
|
||||
allocExtSharedCuda<char>(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
|
||||
memcpyCuda(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(),
|
||||
(char*)devicePlans[devicePlanKey].data(),
|
||||
devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
|
||||
GpuBuffer(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory();
|
||||
gpuMemcpy(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(),
|
||||
(char*)devicePlans[devicePlanKey].data(),
|
||||
devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
|
||||
this->contexts[key].currentDevicePlan = devicePlanKey;
|
||||
return this->contexts[key];
|
||||
}
|
||||
@@ -170,12 +170,7 @@ struct Executor::Impl {
|
||||
size_t maxScratchBufferSize = plan.impl_->getMaxScratchBufferSize(rank);
|
||||
size_t scratchBufferSize =
|
||||
std::min(plan.impl_->getScratchBufferSize(rank, sendMemRange, recvMemRange), maxScratchBufferSize);
|
||||
std::shared_ptr<char> scratchBuffer;
|
||||
if (isNvlsSupported()) {
|
||||
scratchBuffer = allocSharedPhysicalCuda<char>(scratchBufferSize);
|
||||
} else {
|
||||
scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
|
||||
}
|
||||
std::shared_ptr<char> scratchBuffer = GpuBuffer(scratchBufferSize).memory();
|
||||
context.scratchBuffer = scratchBuffer;
|
||||
context.scratchBufferSize = scratchBufferSize;
|
||||
context.proxyService = std::make_shared<ProxyService>();
|
||||
@@ -186,11 +181,10 @@ struct Executor::Impl {
|
||||
this->setupNvlsChannels(context, sendbuff, recvbuff, sendMemRange, recvMemRange, rank, plan);
|
||||
this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan);
|
||||
context.deviceExecutionPlansBuffers[devicePlanKey] =
|
||||
allocExtSharedCuda<char>(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
|
||||
memcpyCuda(context.deviceExecutionPlansBuffers[devicePlanKey].get(),
|
||||
(char*)context.deviceExecutionPlans[devicePlanKey].data(),
|
||||
context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan),
|
||||
cudaMemcpyHostToDevice);
|
||||
GpuBuffer(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory();
|
||||
gpuMemcpy(context.deviceExecutionPlansBuffers[devicePlanKey].get(),
|
||||
(char*)context.deviceExecutionPlans[devicePlanKey].data(),
|
||||
context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
|
||||
context.currentDevicePlan = devicePlanKey;
|
||||
context.proxyService->startProxy();
|
||||
this->contexts.insert({key, context});
|
||||
|
||||
12
src/fifo.cc
12
src/fifo.cc
@@ -10,9 +10,9 @@
|
||||
namespace mscclpp {
|
||||
|
||||
struct Fifo::Impl {
|
||||
UniqueCudaHostPtr<ProxyTrigger[]> triggers;
|
||||
UniqueCudaPtr<uint64_t> head;
|
||||
UniqueCudaPtr<uint64_t> tailReplica;
|
||||
detail::UniqueGpuHostPtr<ProxyTrigger> triggers;
|
||||
detail::UniqueGpuPtr<uint64_t> head;
|
||||
detail::UniqueGpuPtr<uint64_t> tailReplica;
|
||||
const int size;
|
||||
|
||||
// allocated on the host. Only accessed by the host. This is a copy of the
|
||||
@@ -28,9 +28,9 @@ struct Fifo::Impl {
|
||||
CudaStreamWithFlags stream;
|
||||
|
||||
Impl(int size)
|
||||
: triggers(makeUniqueCudaHost<ProxyTrigger[]>(size)),
|
||||
head(allocUniqueCuda<uint64_t>()),
|
||||
tailReplica(allocUniqueCuda<uint64_t>()),
|
||||
: triggers(detail::gpuCallocHostUnique<ProxyTrigger>(size)),
|
||||
head(detail::gpuCallocUnique<uint64_t>()),
|
||||
tailReplica(detail::gpuCallocUnique<uint64_t>()),
|
||||
size(size),
|
||||
hostTail(0),
|
||||
stream(cudaStreamNonBlocking) {}
|
||||
|
||||
165
src/gpu_utils.cc
Normal file
165
src/gpu_utils.cc
Normal file
@@ -0,0 +1,165 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <cstring>
|
||||
#include <mscclpp/gpu.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// set memory access permission to read-write
|
||||
/// @param base Base memory pointer.
|
||||
/// @param size Size of the memory.
|
||||
void setReadWriteMemoryAccess(void* base, size_t size) {
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
int deviceId;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
accessDesc.location.id = deviceId;
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, size, &accessDesc, 1));
|
||||
}
|
||||
|
||||
void* gpuCalloc(size_t bytes) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
void* ptr;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&ptr, bytes));
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, bytes, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void* gpuCallocHost(size_t bytes) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
void* ptr;
|
||||
MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, bytes, cudaHostAllocMapped | cudaHostAllocWriteCombined));
|
||||
::memset(ptr, 0, bytes);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
void* gpuCallocUncached(size_t bytes) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
void* ptr;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(hipExtMallocWithFlags((void**)&ptr, bytes, hipDeviceMallocUncached));
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, bytes, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
return ptr;
|
||||
}
|
||||
#endif // defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
|
||||
size_t gran = 0;
|
||||
int numDevices = 0;
|
||||
MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices));
|
||||
|
||||
CUmulticastObjectProp prop = {};
|
||||
prop.size = size;
|
||||
// This is a dummy value, it might affect the granularity in the future
|
||||
prop.numDevices = numDevices;
|
||||
prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
|
||||
prop.flags = 0;
|
||||
MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag));
|
||||
return gran;
|
||||
}
|
||||
|
||||
void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
int deviceId = -1;
|
||||
CUdevice currentDevice;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
MSCCLPP_CUTHROW(cuDeviceGet(¤tDevice, deviceId));
|
||||
|
||||
CUmemAllocationProp prop = {};
|
||||
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.requestedHandleTypes =
|
||||
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
|
||||
prop.location.id = currentDevice;
|
||||
|
||||
if (gran == 0) {
|
||||
gran = getMulticastGranularity(bytes, CU_MULTICAST_GRANULARITY_RECOMMENDED);
|
||||
}
|
||||
|
||||
// allocate physical memory
|
||||
CUmemGenericAllocationHandle memHandle;
|
||||
size_t nbytes = (bytes + gran - 1) / gran * gran;
|
||||
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));
|
||||
|
||||
if (align == 0) {
|
||||
align = getMulticastGranularity(nbytes, CU_MULTICAST_GRANULARITY_MINIMUM);
|
||||
}
|
||||
|
||||
void* devicePtr = nullptr;
|
||||
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, align, 0U, 0));
|
||||
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0));
|
||||
setReadWriteMemoryAccess(devicePtr, nbytes);
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
|
||||
return devicePtr;
|
||||
}
|
||||
#endif // CUDA_NVLS_SUPPORTED
|
||||
|
||||
void gpuFree(void* ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaFree(ptr));
|
||||
}
|
||||
|
||||
void gpuFreeHost(void* ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
|
||||
}
|
||||
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
void gpuFreePhysical(void* ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
CUmemGenericAllocationHandle handle;
|
||||
size_t size = 0;
|
||||
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, ptr));
|
||||
MSCCLPP_CUTHROW(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
|
||||
MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, size));
|
||||
MSCCLPP_CUTHROW(cuMemRelease(handle));
|
||||
MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, size));
|
||||
}
|
||||
#endif // CUDA_NVLS_SUPPORTED
|
||||
|
||||
void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream, cudaMemcpyKind kind) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, bytes, kind, stream));
|
||||
}
|
||||
|
||||
void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, bytes, kind, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
bool isNvlsSupported() {
|
||||
[[maybe_unused]] static bool result = false;
|
||||
[[maybe_unused]] static bool isChecked = false;
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
if (!isChecked) {
|
||||
int isMulticastSupported;
|
||||
int isFabricSupported;
|
||||
CUdevice dev;
|
||||
MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
|
||||
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
|
||||
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev));
|
||||
result = (isMulticastSupported == 1 && isFabricSupported == 1);
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
@@ -12,16 +12,16 @@
|
||||
|
||||
uint64_t NpKit::rank_ = 0;
|
||||
|
||||
std::vector<mscclpp::UniqueCudaPtr<NpKitEvent>> NpKit::gpu_event_buffers_;
|
||||
std::vector<mscclpp::detail::UniqueGpuPtr<NpKitEvent>> NpKit::gpu_event_buffers_;
|
||||
std::vector<std::unique_ptr<NpKitEvent[]>> NpKit::cpu_event_buffers_;
|
||||
|
||||
mscclpp::UniqueCudaPtr<NpKitEventCollectContext> NpKit::gpu_collect_contexts_;
|
||||
mscclpp::detail::UniqueGpuPtr<NpKitEventCollectContext> NpKit::gpu_collect_contexts_;
|
||||
std::unique_ptr<NpKitEventCollectContext[]> NpKit::cpu_collect_contexts_;
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
mscclpp::UniqueCudaHostPtr<uint64_t[]> NpKit::cpu_timestamp_;
|
||||
mscclpp::detail::UniqueGpuHostPtr<uint64_t[]> NpKit::cpu_timestamp_;
|
||||
#else
|
||||
mscclpp::UniqueCudaHostPtr<uint64_t> NpKit::cpu_timestamp_;
|
||||
mscclpp::detail::UniqueGpuHostPtr<uint64_t> NpKit::cpu_timestamp_;
|
||||
#endif
|
||||
std::unique_ptr<std::thread> NpKit::cpu_timestamp_update_thread_;
|
||||
volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false;
|
||||
@@ -53,11 +53,11 @@ void NpKit::Init(int rank) {
|
||||
rank_ = rank;
|
||||
|
||||
// Init event data structures
|
||||
gpu_collect_contexts_ = mscclpp::allocUniqueCuda<NpKitEventCollectContext>(NpKit::kNumGpuEventBuffers);
|
||||
gpu_collect_contexts_ = mscclpp::detail::gpuCallocUnique<NpKitEventCollectContext>(NpKit::kNumGpuEventBuffers);
|
||||
for (i = 0; i < NpKit::kNumGpuEventBuffers; i++) {
|
||||
gpu_event_buffers_.emplace_back(mscclpp::allocUniqueCuda<NpKitEvent>(kMaxNumGpuEventsPerBuffer));
|
||||
gpu_event_buffers_.emplace_back(mscclpp::detail::gpuCallocUnique<NpKitEvent>(kMaxNumGpuEventsPerBuffer));
|
||||
ctx.event_buffer = gpu_event_buffers_[i].get();
|
||||
mscclpp::memcpyCuda(gpu_collect_contexts_.get() + i, &ctx, 1);
|
||||
mscclpp::gpuMemcpy(gpu_collect_contexts_.get() + i, &ctx, 1);
|
||||
}
|
||||
|
||||
cpu_collect_contexts_ = std::make_unique<NpKitEventCollectContext[]>(NpKit::kNumCpuEventBuffers);
|
||||
@@ -69,15 +69,15 @@ void NpKit::Init(int rank) {
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
// Init timestamp. Allocates MAXCHANNELS*128 bytes buffer for GPU
|
||||
cpu_timestamp_ = mscclpp::makeUniqueCudaHost<uint64_t[]>(NPKIT_MAX_NUM_GPU_THREADBLOCKS *
|
||||
NPKIT_CPU_TIMESTAMP_SLOT_SIZE / sizeof(uint64_t));
|
||||
cpu_timestamp_ = mscclpp::detail::gpuCallocHostUnique<uint64_t[]>(NPKIT_MAX_NUM_GPU_THREADBLOCKS *
|
||||
NPKIT_CPU_TIMESTAMP_SLOT_SIZE / sizeof(uint64_t));
|
||||
for (int i = 0; i < NPKIT_MAX_NUM_GPU_THREADBLOCKS; i++) {
|
||||
NPKIT_STORE_CPU_TIMESTAMP_PER_BLOCK(cpu_timestamp_.get(),
|
||||
std::chrono::system_clock::now().time_since_epoch().count(), i);
|
||||
}
|
||||
#else
|
||||
// Init timestamp
|
||||
cpu_timestamp_ = mscclpp::makeUniqueCudaHost<uint64_t>();
|
||||
cpu_timestamp_ = mscclpp::detail::gpuCallocHostUnique<uint64_t>();
|
||||
volatile uint64_t* volatile_cpu_timestamp = cpu_timestamp_.get();
|
||||
*volatile_cpu_timestamp = std::chrono::system_clock::now().time_since_epoch().count();
|
||||
#endif
|
||||
@@ -153,8 +153,8 @@ void NpKit::Dump(const std::string& dump_dir) {
|
||||
dump_file_path += std::to_string(rank_);
|
||||
dump_file_path += "_buf_";
|
||||
dump_file_path += std::to_string(i);
|
||||
mscclpp::memcpyCuda(cpu_event_buffers_[0].get(), gpu_event_buffers_[i].get(), kMaxNumGpuEventsPerBuffer);
|
||||
mscclpp::memcpyCuda(cpu_collect_contexts_.get(), gpu_collect_contexts_.get() + i, 1);
|
||||
mscclpp::gpuMemcpy(cpu_event_buffers_[0].get(), gpu_event_buffers_[i].get(), kMaxNumGpuEventsPerBuffer);
|
||||
mscclpp::gpuMemcpy(cpu_collect_contexts_.get(), gpu_collect_contexts_.get() + i, 1);
|
||||
auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
|
||||
gpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[0].get()),
|
||||
cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
|
||||
|
||||
@@ -58,7 +58,7 @@ NvlsConnection::Impl::Impl(size_t bufferSize, int numDevices) {
|
||||
mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM));
|
||||
MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED));
|
||||
mcProp_.size = ((mcProp_.size + minMcGran_ - 1) / minMcGran_) * minMcGran_;
|
||||
mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_;
|
||||
bufferSize_ = mcProp_.size;
|
||||
MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_));
|
||||
mcFileDesc_ = 0;
|
||||
@@ -200,7 +200,7 @@ std::shared_ptr<char> NvlsConnection::Impl::bindMemory(CUdeviceptr devicePtr, si
|
||||
char* mcPtr;
|
||||
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0));
|
||||
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0));
|
||||
setReadWriteMemoryAccess(mcPtr, devBuffSize);
|
||||
detail::setReadWriteMemoryAccess(mcPtr, devBuffSize);
|
||||
INFO(MSCCLPP_COLL, "NVLS connection bound memory at offset %ld, size %ld", offset, devBuffSize);
|
||||
|
||||
auto deleter = [=, self = shared_from_this()](char* ptr) {
|
||||
@@ -240,8 +240,6 @@ class NvlsConnection::Impl {
|
||||
};
|
||||
#endif // !(CUDA_NVLS_SUPPORTED)
|
||||
|
||||
const int NvlsConnection::DefaultNvlsBufferSize = (1 << 29);
|
||||
|
||||
NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices)
|
||||
: pimpl_(std::make_shared<Impl>(bufferSize, numDevices)) {}
|
||||
|
||||
|
||||
@@ -244,7 +244,7 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
|
||||
size_t gran = getRecommendedGranularity();
|
||||
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&base, this->size, gran, 0, 0));
|
||||
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)base, this->size, 0, handle, 0));
|
||||
setReadWriteMemoryAccess(base, this->size);
|
||||
detail::setReadWriteMemoryAccess(base, this->size);
|
||||
this->data = static_cast<char*>(base) + entry.offsetFromBase;
|
||||
} else {
|
||||
MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
|
||||
|
||||
@@ -19,9 +19,17 @@ static NonblockingFuture<RegisteredMemory> setupInboundSemaphoreId(Communicator&
|
||||
return communicator.recvMemoryOnSetup(remoteRank, tag);
|
||||
}
|
||||
|
||||
static detail::UniqueGpuPtr<uint64_t> createGpuSemaphoreId() {
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
return detail::gpuCallocUncachedUnique<uint64_t>();
|
||||
#else // !defined(__HIP_PLATFORM_AMD__)
|
||||
return detail::gpuCallocUnique<uint64_t>();
|
||||
#endif // !defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator,
|
||||
std::shared_ptr<Connection> connection)
|
||||
: BaseSemaphore(allocExtUniqueCuda<uint64_t>(), allocExtUniqueCuda<uint64_t>(), std::make_unique<uint64_t>()),
|
||||
: BaseSemaphore(createGpuSemaphoreId(), createGpuSemaphoreId(), std::make_unique<uint64_t>()),
|
||||
connection_(connection) {
|
||||
INFO(MSCCLPP_INIT, "Creating a Host2Device semaphore for %s transport from %d to %d",
|
||||
connection->getTransportName().c_str(), communicator.bootstrap()->getRank(),
|
||||
@@ -85,7 +93,7 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) {
|
||||
|
||||
MSCCLPP_API_CPP SmDevice2DeviceSemaphore::SmDevice2DeviceSemaphore(Communicator& communicator,
|
||||
std::shared_ptr<Connection> connection)
|
||||
: BaseSemaphore(allocExtUniqueCuda<uint64_t>(), allocExtUniqueCuda<uint64_t>(), allocExtUniqueCuda<uint64_t>()) {
|
||||
: BaseSemaphore(createGpuSemaphoreId(), createGpuSemaphoreId(), createGpuSemaphoreId()) {
|
||||
INFO(MSCCLPP_INIT, "Creating a Device2Device semaphore for %s transport from %d to %d",
|
||||
connection->getTransportName().c_str(), communicator.bootstrap()->getRank(),
|
||||
communicator.remoteRankOf(*connection));
|
||||
|
||||
19
src/utils.cc
19
src/utils.cc
@@ -7,7 +7,6 @@
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <mscclpp/errors.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
@@ -67,22 +66,4 @@ std::string getHostName(int maxlen, const char delim) {
|
||||
return hostname.substr(0, i);
|
||||
}
|
||||
|
||||
bool isNvlsSupported() {
|
||||
[[maybe_unused]] static bool result = false;
|
||||
[[maybe_unused]] static bool isChecked = false;
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
if (!isChecked) {
|
||||
int isMulticastSupported;
|
||||
int isFabricSupported;
|
||||
CUdevice dev;
|
||||
MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
|
||||
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
|
||||
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev));
|
||||
result = (isMulticastSupported == 1 && isFabricSupported == 1);
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -129,12 +129,7 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
|
||||
mscclpp::ExecutionPlan plan(executionPlanPath);
|
||||
std::shared_ptr<char> sendbuff;
|
||||
if (mscclpp::isNvlsSupported()) {
|
||||
sendbuff = mscclpp::allocSharedPhysicalCuda<char>(bufferSize);
|
||||
} else {
|
||||
sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
|
||||
}
|
||||
std::shared_ptr<char> sendbuff = mscclpp::GpuBuffer(bufferSize).memory();
|
||||
std::vector<int> dataHost(bufferSize / sizeof(int), rank);
|
||||
MSCCLPP_CUDATHROW(cudaMemcpy(sendbuff.get(), dataHost.data(), bufferSize, cudaMemcpyHostToDevice));
|
||||
double deltaSec = benchTime(rank, bootstrap, executor, plan, sendbuff, bufferSize, niters, ngraphIters, packetType);
|
||||
|
||||
@@ -113,7 +113,7 @@ void CommunicatorTest::SetUp() {
|
||||
}
|
||||
|
||||
for (size_t n = 0; n < numBuffers; n++) {
|
||||
devicePtr[n] = mscclpp::allocSharedCuda<int>(deviceBufferSize / sizeof(int));
|
||||
devicePtr[n] = mscclpp::detail::gpuCallocShared<int>(deviceBufferSize / sizeof(int));
|
||||
registerMemoryPairs(devicePtr[n].get(), deviceBufferSize, mscclpp::Transport::CudaIpc | ibTransport, 0, remoteRanks,
|
||||
localMemory[n], remoteMemory[n]);
|
||||
}
|
||||
@@ -133,7 +133,7 @@ void CommunicatorTest::deviceBufferInit() {
|
||||
for (size_t i = 0; i < dataCount; i++) {
|
||||
hostBuffer[i] = gEnv->rank + n * gEnv->worldSize;
|
||||
}
|
||||
mscclpp::memcpyCuda<int>(devicePtr[n].get(), hostBuffer.data(), dataCount, cudaMemcpyHostToDevice);
|
||||
mscclpp::gpuMemcpy<int>(devicePtr[n].get(), hostBuffer.data(), dataCount, cudaMemcpyHostToDevice);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,7 +155,7 @@ bool CommunicatorTest::testWriteCorrectness(bool skipLocal) {
|
||||
size_t dataCount = deviceBufferSize / sizeof(int);
|
||||
for (int n = 0; n < (int)devicePtr.size(); n++) {
|
||||
std::vector<int> hostBuffer(dataCount, 0);
|
||||
mscclpp::memcpyCuda<int>(hostBuffer.data(), devicePtr[n].get(), dataCount, cudaMemcpyDeviceToHost);
|
||||
mscclpp::gpuMemcpy<int>(hostBuffer.data(), devicePtr[n].get(), dataCount, cudaMemcpyDeviceToHost);
|
||||
for (int i = 0; i < gEnv->worldSize; i++) {
|
||||
if (((i / gEnv->nRanksPerNode) == (gEnv->rank / gEnv->nRanksPerNode)) && skipLocal) {
|
||||
continue;
|
||||
@@ -214,12 +214,13 @@ TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) {
|
||||
deviceBufferInit();
|
||||
communicator->bootstrap()->barrier();
|
||||
|
||||
auto deviceSemaphoreHandles = mscclpp::allocSharedCuda<mscclpp::Host2DeviceSemaphore::DeviceHandle>(gEnv->worldSize);
|
||||
auto deviceSemaphoreHandles =
|
||||
mscclpp::detail::gpuCallocShared<mscclpp::Host2DeviceSemaphore::DeviceHandle>(gEnv->worldSize);
|
||||
for (int i = 0; i < gEnv->worldSize; i++) {
|
||||
if (i != gEnv->rank) {
|
||||
mscclpp::Host2DeviceSemaphore::DeviceHandle deviceHandle = semaphores[i]->deviceHandle();
|
||||
mscclpp::memcpyCuda<mscclpp::Host2DeviceSemaphore::DeviceHandle>(deviceSemaphoreHandles.get() + i, &deviceHandle,
|
||||
1, cudaMemcpyHostToDevice);
|
||||
mscclpp::gpuMemcpy<mscclpp::Host2DeviceSemaphore::DeviceHandle>(deviceSemaphoreHandles.get() + i, &deviceHandle,
|
||||
1, cudaMemcpyHostToDevice);
|
||||
}
|
||||
}
|
||||
communicator->bootstrap()->barrier();
|
||||
|
||||
@@ -57,7 +57,7 @@ TEST_F(ExecutorTest, TwoNodesAllreduce) {
|
||||
path.parent_path().parent_path().parent_path() / "test/execution-files/allreduce.json";
|
||||
mscclpp::ExecutionPlan plan(executionFilesPath.string());
|
||||
const int bufferSize = 1024 * 1024;
|
||||
std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
|
||||
std::shared_ptr<char> sendbuff = mscclpp::GpuBuffer(bufferSize).memory();
|
||||
mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
executor->execute(gEnv->rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16,
|
||||
plan, stream);
|
||||
|
||||
@@ -82,7 +82,7 @@ TEST_F(IbPeerToPeerTest, SimpleSendRecv) {
|
||||
|
||||
const int maxIter = 100000;
|
||||
const int nelem = 1;
|
||||
auto data = mscclpp::allocUniqueCuda<int>(nelem);
|
||||
auto data = mscclpp::detail::gpuCallocUnique<int>(nelem);
|
||||
|
||||
registerBufferAndConnect(data.get(), sizeof(int) * nelem);
|
||||
|
||||
@@ -196,7 +196,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) {
|
||||
const uint64_t signalPeriod = 1024;
|
||||
const uint64_t maxIter = 10000;
|
||||
const uint64_t nelem = 65536 + 1;
|
||||
auto data = mscclpp::allocUniqueCuda<uint64_t>(nelem);
|
||||
auto data = mscclpp::detail::gpuCallocUnique<uint64_t>(nelem);
|
||||
|
||||
registerBufferAndConnect(data.get(), sizeof(uint64_t) * nelem);
|
||||
|
||||
@@ -205,12 +205,14 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) {
|
||||
|
||||
if (gEnv->rank == 0) {
|
||||
// Receiver
|
||||
auto curIter = mscclpp::makeUniqueCudaHost<uint64_t>(0);
|
||||
auto result = mscclpp::makeUniqueCudaHost<int>(0);
|
||||
auto curIter = mscclpp::detail::gpuCallocHostUnique<uint64_t>();
|
||||
auto result = mscclpp::detail::gpuCallocHostUnique<int>();
|
||||
|
||||
volatile uint64_t* ptrCurIter = (volatile uint64_t*)curIter.get();
|
||||
volatile int* ptrResult = (volatile int*)result.get();
|
||||
|
||||
ASSERT_NE(ptrCurIter, nullptr);
|
||||
ASSERT_NE(ptrResult, nullptr);
|
||||
ASSERT_EQ(*ptrCurIter, 0);
|
||||
ASSERT_EQ(*ptrResult, 0);
|
||||
|
||||
@@ -246,7 +248,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) {
|
||||
for (uint64_t i = 0; i < nelem; i++) {
|
||||
hostBuffer[i] = iter;
|
||||
}
|
||||
mscclpp::memcpyCuda<uint64_t>(data.get(), hostBuffer.data(), nelem, cudaMemcpyHostToDevice);
|
||||
mscclpp::gpuMemcpy<uint64_t>(data.get(), hostBuffer.data(), nelem, cudaMemcpyHostToDevice);
|
||||
|
||||
// Need to signal from time to time to empty the IB send queue
|
||||
bool signaled = (iter % signalPeriod == 0);
|
||||
@@ -303,7 +305,7 @@ TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) {
|
||||
|
||||
const int maxIter = 100000;
|
||||
const int nelem = 1;
|
||||
auto data = mscclpp::allocUniqueCuda<int>(nelem);
|
||||
auto data = mscclpp::detail::gpuCallocUnique<int>(nelem);
|
||||
|
||||
registerBufferAndConnect(data.get(), sizeof(int) * nelem);
|
||||
|
||||
|
||||
@@ -157,7 +157,7 @@ void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) {
|
||||
const int nElem = 4 * 1024 * 1024;
|
||||
|
||||
std::vector<mscclpp::ProxyChannel> proxyChannels;
|
||||
std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
|
||||
std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
|
||||
setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int));
|
||||
|
||||
std::vector<DeviceHandle<mscclpp::ProxyChannel>> proxyChannelHandles;
|
||||
@@ -169,7 +169,7 @@ void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) {
|
||||
|
||||
proxyService->startProxy();
|
||||
|
||||
std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
|
||||
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
|
||||
|
||||
const int nTries = 1000;
|
||||
|
||||
@@ -202,7 +202,7 @@ void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
|
||||
const int nElem = 4 * 1024 * 1024;
|
||||
|
||||
std::vector<mscclpp::ProxyChannel> proxyChannels;
|
||||
std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
|
||||
std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
|
||||
setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int));
|
||||
|
||||
std::vector<DeviceHandle<mscclpp::ProxyChannel>> proxyChannelHandles;
|
||||
@@ -214,7 +214,7 @@ void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
|
||||
|
||||
proxyService->startProxy();
|
||||
|
||||
std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
|
||||
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
|
||||
|
||||
auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
|
||||
const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name());
|
||||
@@ -344,11 +344,11 @@ void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) {
|
||||
const int nElem = 4 * 1024 * 1024;
|
||||
|
||||
std::vector<mscclpp::ProxyChannel> proxyChannels;
|
||||
std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
|
||||
std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
|
||||
|
||||
const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t);
|
||||
auto putPacketBuffer = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(nPacket);
|
||||
auto getPacketBuffer = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(nPacket);
|
||||
auto putPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();
|
||||
auto getPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();
|
||||
|
||||
setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(),
|
||||
nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket));
|
||||
@@ -368,7 +368,7 @@ void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) {
|
||||
|
||||
proxyService->startProxy();
|
||||
|
||||
std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
|
||||
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
|
||||
|
||||
const int nTries = 1000;
|
||||
|
||||
@@ -411,11 +411,11 @@ void ProxyChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) {
|
||||
const int nElem = 4 * 1024 * 1024;
|
||||
|
||||
std::vector<mscclpp::ProxyChannel> proxyChannels;
|
||||
std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
|
||||
std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
|
||||
|
||||
const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t);
|
||||
auto putPacketBuffer = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(nPacket);
|
||||
auto getPacketBuffer = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(nPacket);
|
||||
auto putPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();
|
||||
auto getPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();
|
||||
|
||||
setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(),
|
||||
nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket));
|
||||
|
||||
@@ -77,8 +77,8 @@ void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, Packe
|
||||
const int defaultNTries = 1000;
|
||||
|
||||
std::vector<mscclpp::SmChannel> smChannels;
|
||||
std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
|
||||
std::shared_ptr<int> intermBuff = mscclpp::allocExtSharedCuda<int>(nElem * 2);
|
||||
std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
|
||||
std::shared_ptr<int> intermBuff = mscclpp::GpuBuffer<int>(nElem * 2).memory();
|
||||
setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int), intermBuff.get(), nElem * 2 * sizeof(int));
|
||||
std::vector<DeviceHandle<mscclpp::SmChannel>> deviceHandles(smChannels.size());
|
||||
std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(),
|
||||
@@ -88,7 +88,7 @@ void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, Packe
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(),
|
||||
sizeof(DeviceHandle<mscclpp::SmChannel>)));
|
||||
|
||||
std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
|
||||
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
|
||||
|
||||
// The least nelem is 2 for packet ping pong
|
||||
kernelWrapper(buff.get(), gEnv->rank, 2, ret.get(), defaultNTries);
|
||||
@@ -178,7 +178,7 @@ TEST_F(SmChannelOneToOneTest, PutPingPong) {
|
||||
const int nElem = 4 * 1024 * 1024;
|
||||
|
||||
std::vector<mscclpp::SmChannel> smChannels;
|
||||
std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
|
||||
std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
|
||||
setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int));
|
||||
std::vector<DeviceHandle<mscclpp::SmChannel>> deviceHandles(smChannels.size());
|
||||
std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(),
|
||||
@@ -188,7 +188,7 @@ TEST_F(SmChannelOneToOneTest, PutPingPong) {
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(),
|
||||
sizeof(DeviceHandle<mscclpp::SmChannel>)));
|
||||
|
||||
std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
|
||||
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
|
||||
|
||||
kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get());
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
@@ -257,7 +257,7 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) {
|
||||
const int nElem = 4 * 1024 * 1024;
|
||||
|
||||
std::vector<mscclpp::SmChannel> smChannels;
|
||||
std::shared_ptr<int> buff = mscclpp::allocExtSharedCuda<int>(nElem);
|
||||
std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(nElem).memory();
|
||||
setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int));
|
||||
std::vector<DeviceHandle<mscclpp::SmChannel>> deviceHandles(smChannels.size());
|
||||
std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(),
|
||||
@@ -267,7 +267,7 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) {
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(),
|
||||
sizeof(DeviceHandle<mscclpp::SmChannel>)));
|
||||
|
||||
std::shared_ptr<int> ret = mscclpp::makeSharedCudaHost<int>(0);
|
||||
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
|
||||
|
||||
kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get());
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <mscclpp/concurrency_device.hpp>
|
||||
#include <string>
|
||||
|
||||
@@ -711,13 +712,13 @@ class AllGatherTestEngine : public BaseTestEngine {
|
||||
AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args, "allgather") {}
|
||||
|
||||
void AllGatherTestEngine::allocateBuffer() {
|
||||
sendBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
sendBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
|
||||
expectedBuff_ = std::shared_ptr<int[]>(new int[args_.maxBytes / sizeof(int)]);
|
||||
if (args_.kernelNum == 7) {
|
||||
const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
|
||||
// 2x for double-buffering, scratchBuff used to store original data and reduced results
|
||||
const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/;
|
||||
scratchPacketBuff_ = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(scratchBuffNelem);
|
||||
scratchPacketBuff_ = mscclpp::GpuBuffer<mscclpp::LLPacket>(scratchBuffNelem).memory();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <mscclpp/concurrency_device.hpp>
|
||||
#include <mscclpp/packet_device.hpp>
|
||||
#include <vector>
|
||||
@@ -1272,30 +1273,30 @@ bool AllReduceTestEngine::isInPlace() const {
|
||||
}
|
||||
|
||||
void AllReduceTestEngine::allocateBuffer() {
|
||||
inputBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
resultBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
inputBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
|
||||
resultBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
|
||||
inputBuff = inputBuff_.get();
|
||||
resultBuff = resultBuff_.get();
|
||||
|
||||
if (args_.kernelNum == 0 || args_.kernelNum == 1 || args_.kernelNum == 3 || args_.kernelNum == 4) {
|
||||
scratchBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
scratchBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
|
||||
scratchBuff = scratchBuff_.get();
|
||||
} else if (args_.kernelNum == 2) {
|
||||
const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
|
||||
// 2x for double-buffering
|
||||
const size_t scratchBuffNelem = nPacket * std::max(args_.nRanksPerNode - 1, 1) * 2;
|
||||
scratchPacketBuff_ = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(scratchBuffNelem);
|
||||
scratchPacketBuff_ = mscclpp::GpuBuffer<mscclpp::LLPacket>(scratchBuffNelem).memory();
|
||||
scratchPacketBuff = scratchPacketBuff_.get();
|
||||
const size_t packetBuffNelem = nPacket * 2;
|
||||
putPacketBuff_ = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(packetBuffNelem);
|
||||
getPacketBuff_ = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(packetBuffNelem);
|
||||
putPacketBuff_ = mscclpp::GpuBuffer<mscclpp::LLPacket>(packetBuffNelem).memory();
|
||||
getPacketBuff_ = mscclpp::GpuBuffer<mscclpp::LLPacket>(packetBuffNelem).memory();
|
||||
putPacketBuff = putPacketBuff_.get();
|
||||
getPacketBuff = getPacketBuff_.get();
|
||||
} else if (args_.kernelNum == 6 || args_.kernelNum == 7) {
|
||||
const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
|
||||
// 2x for double-buffering, scratchBuff used to store original data and reduced results
|
||||
const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/;
|
||||
scratchPacketBuff_ = mscclpp::allocExtSharedCuda<mscclpp::LLPacket>(scratchBuffNelem);
|
||||
scratchPacketBuff_ = mscclpp::GpuBuffer<mscclpp::LLPacket>(scratchBuffNelem).memory();
|
||||
scratchPacketBuff = scratchPacketBuff_.get();
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <mscclpp/concurrency_device.hpp>
|
||||
|
||||
#include "common.hpp"
|
||||
@@ -139,8 +140,8 @@ class AllToAllTestEngine : public BaseTestEngine {
|
||||
AllToAllTestEngine::AllToAllTestEngine(const TestArgs& args) : BaseTestEngine(args, "alltoall") { inPlace_ = false; }
|
||||
|
||||
void AllToAllTestEngine::allocateBuffer() {
|
||||
sendBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
recvBuff_ = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
sendBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
|
||||
recvBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
|
||||
expectedBuff_ = std::shared_ptr<int[]>(new int[args_.maxBytes / sizeof(int)]);
|
||||
|
||||
localSendBuff = sendBuff_.get();
|
||||
|
||||
@@ -137,8 +137,8 @@ class SendRecvTestEngine : public BaseTestEngine {
|
||||
SendRecvTestEngine::SendRecvTestEngine(const TestArgs& args) : BaseTestEngine(args, "sendrecv") { inPlace_ = false; }
|
||||
|
||||
void SendRecvTestEngine::allocateBuffer() {
|
||||
std::shared_ptr<int> sendBuff = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
std::shared_ptr<int> recvBuff = mscclpp::allocExtSharedCuda<int>(args_.maxBytes / sizeof(int));
|
||||
std::shared_ptr<int> sendBuff = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
|
||||
std::shared_ptr<int> recvBuff = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
|
||||
devicePtrs_.push_back(sendBuff);
|
||||
devicePtrs_.push_back(recvBuff);
|
||||
|
||||
|
||||
@@ -6,23 +6,23 @@
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
|
||||
TEST(CudaUtilsTest, AllocShared) {
|
||||
auto p1 = mscclpp::allocSharedCuda<uint32_t>();
|
||||
auto p2 = mscclpp::allocSharedCuda<int64_t>(5);
|
||||
auto p1 = mscclpp::detail::gpuCallocShared<uint32_t>();
|
||||
auto p2 = mscclpp::detail::gpuCallocShared<int64_t>(5);
|
||||
}
|
||||
|
||||
TEST(CudaUtilsTest, AllocUnique) {
|
||||
auto p1 = mscclpp::allocUniqueCuda<uint32_t>();
|
||||
auto p2 = mscclpp::allocUniqueCuda<int64_t>(5);
|
||||
auto p1 = mscclpp::detail::gpuCallocUnique<uint32_t>();
|
||||
auto p2 = mscclpp::detail::gpuCallocUnique<int64_t>(5);
|
||||
}
|
||||
|
||||
TEST(CudaUtilsTest, MakeSharedHost) {
|
||||
auto p1 = mscclpp::makeSharedCudaHost<uint32_t>();
|
||||
auto p2 = mscclpp::makeSharedCudaHost<int64_t>(5);
|
||||
auto p1 = mscclpp::detail::gpuCallocHostShared<uint32_t>();
|
||||
auto p2 = mscclpp::detail::gpuCallocHostShared<int64_t>(5);
|
||||
}
|
||||
|
||||
TEST(CudaUtilsTest, MakeUniqueHost) {
|
||||
auto p1 = mscclpp::makeUniqueCudaHost<uint32_t>();
|
||||
auto p2 = mscclpp::makeUniqueCudaHost<int64_t>(5);
|
||||
auto p1 = mscclpp::detail::gpuCallocHostUnique<uint32_t>();
|
||||
auto p2 = mscclpp::detail::gpuCallocHostUnique<int64_t>(5);
|
||||
}
|
||||
|
||||
TEST(CudaUtilsTest, Memcpy) {
|
||||
@@ -32,9 +32,9 @@ TEST(CudaUtilsTest, Memcpy) {
|
||||
hostBuff[i] = i + 1;
|
||||
}
|
||||
std::vector<int> hostBuffTmp(nElem, 0);
|
||||
auto devBuff = mscclpp::allocSharedCuda<int>(nElem);
|
||||
mscclpp::memcpyCuda<int>(devBuff.get(), hostBuff.data(), nElem, cudaMemcpyHostToDevice);
|
||||
mscclpp::memcpyCuda<int>(hostBuffTmp.data(), devBuff.get(), nElem, cudaMemcpyDeviceToHost);
|
||||
auto devBuff = mscclpp::detail::gpuCallocShared<int>(nElem);
|
||||
mscclpp::gpuMemcpy<int>(devBuff.get(), hostBuff.data(), nElem, cudaMemcpyHostToDevice);
|
||||
mscclpp::gpuMemcpy<int>(hostBuffTmp.data(), devBuff.get(), nElem, cudaMemcpyDeviceToHost);
|
||||
|
||||
for (int i = 0; i < nElem; ++i) {
|
||||
EXPECT_EQ(hostBuff[i], hostBuffTmp[i]);
|
||||
|
||||
Reference in New Issue
Block a user