Files
mscclpp/src/core/gpu_utils.cc
Changho Hwang 42be3660e0 Add a new IB stack impl that doesn't use RDMA atomics (#728)
* Added configurable InfiniBand (IB) signaling mode.
`EndpointConfig::Ib::Mode` enum selects the mode (`Default`, `Host`,
`HostNoAtomic`). `Default` is equivalent to `Host` unless specified
different by envrionment `MSCCLPP_IBV_MODE`. `Host` corresponds to the
previous implementation using RDMA atomics for signaling, while
`HostNoAtomic` uses write-with-immediate instead.
* Regarding updates in Python bindings and API.
2026-02-10 01:07:53 +00:00

311 lines
10 KiB
C++

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <cstring>
#include <mscclpp/gpu.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "gpu_utils_internal.hpp"
namespace mscclpp {
AvoidCudaGraphCaptureGuard::AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed), active_(true) {
cudaError_t res = cudaThreadExchangeStreamCaptureMode(&mode_);
if (isCudaTeardownError(res)) {
// Runtime is going away; just mark inactive so destructor skips restoring.
active_ = false;
(void)cudaGetLastError();
} else {
MSCCLPP_CUDATHROW(res);
}
}
AvoidCudaGraphCaptureGuard::~AvoidCudaGraphCaptureGuard() {
if (!active_) return;
(void)cudaThreadExchangeStreamCaptureMode(&mode_);
}
CudaDeviceGuard::CudaDeviceGuard(int deviceId) : deviceId_(deviceId), origDeviceId_(-1) {
if (deviceId_ >= 0) {
MSCCLPP_CUDATHROW(cudaGetDevice(&origDeviceId_));
if (origDeviceId_ != deviceId_) {
MSCCLPP_CUDATHROW(cudaSetDevice(deviceId_));
}
}
}
CudaDeviceGuard::~CudaDeviceGuard() {
if (deviceId_ >= 0 && origDeviceId_ >= 0 && origDeviceId_ != deviceId_) {
(void)cudaSetDevice(origDeviceId_);
}
}
CudaStreamWithFlags::CudaStreamWithFlags() : stream_(nullptr) { MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_)); }
CudaStreamWithFlags::CudaStreamWithFlags(unsigned int flags) {
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_));
MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags));
}
CudaStreamWithFlags::~CudaStreamWithFlags() {
if (!empty()) (void)cudaStreamDestroy(stream_);
}
void CudaStreamWithFlags::set(unsigned int flags) {
if (!empty()) throw Error("CudaStreamWithFlags already set", ErrorCode::InvalidUsage);
CudaDeviceGuard deviceGuard(deviceId_);
MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags));
}
bool CudaStreamWithFlags::empty() const { return stream_ == nullptr; }
GpuStream::GpuStream(std::shared_ptr<GpuStreamPool> pool, std::shared_ptr<CudaStreamWithFlags> stream)
: pool_(pool), stream_(stream) {}
GpuStream::~GpuStream() { pool_->streams_[deviceId()].push_back(stream_); }
GpuStreamPool::GpuStreamPool() {}
GpuStream GpuStreamPool::getStream() {
int deviceId;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
auto& streamVec = streams_[deviceId];
if (!streamVec.empty()) {
auto stream = streamVec.back();
streamVec.pop_back();
return GpuStream(gpuStreamPool(), stream);
}
return GpuStream(gpuStreamPool(), std::make_shared<CudaStreamWithFlags>(cudaStreamNonBlocking));
}
void GpuStreamPool::clear() { streams_.clear(); }
// A global pool instance
static std::shared_ptr<GpuStreamPool> gGpuStreamPool_;
std::shared_ptr<GpuStreamPool> gpuStreamPool() {
if (!gGpuStreamPool_) {
gGpuStreamPool_ = std::make_shared<GpuStreamPool>();
}
return gGpuStreamPool_;
}
namespace detail {
int gpuIdFromAddress(void* ptr) {
int deviceId;
auto res = cuPointerGetAttribute(&deviceId, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, reinterpret_cast<CUdeviceptr>(ptr));
if (res == CUDA_ERROR_INVALID_VALUE) {
// not a GPU address
return -1;
} else {
MSCCLPP_CUTHROW(res);
}
return deviceId;
}
/// set memory access permission to read-write
/// @param base Base memory pointer.
/// @param size Size of the memory.
void setReadWriteMemoryAccess(void* base, size_t size) {
CUmemAccessDesc accessDesc = {};
int deviceId;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = deviceId;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, size, &accessDesc, 1));
}
void* gpuCalloc(size_t bytes) {
AvoidCudaGraphCaptureGuard cgcGuard;
void* ptr;
auto stream = gpuStreamPool()->getStream();
MSCCLPP_CUDATHROW(cudaMalloc(&ptr, bytes));
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, bytes, stream));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
return ptr;
}
void* gpuCallocHost(size_t bytes, unsigned int flags) {
AvoidCudaGraphCaptureGuard cgcGuard;
void* ptr;
MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, bytes, flags));
::memset(ptr, 0, bytes);
return ptr;
}
#if defined(MSCCLPP_USE_ROCM)
void* gpuCallocUncached(size_t bytes) {
AvoidCudaGraphCaptureGuard cgcGuard;
void* ptr;
auto stream = gpuStreamPool()->getStream();
MSCCLPP_CUDATHROW(hipExtMallocWithFlags((void**)&ptr, bytes, hipDeviceMallocUncached));
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, bytes, stream));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
return ptr;
}
#endif // defined(MSCCLPP_USE_ROCM)
#if (CUDA_NVLS_API_AVAILABLE)
size_t getCuAllocationGranularity(CUmemAllocationGranularity_flags granFlag) {
size_t gran = 0;
int deviceId = -1;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = deviceId;
prop.requestedHandleTypes =
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
cuMemGetAllocationGranularity(&gran, &prop, granFlag);
return gran;
}
size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
size_t gran = 0;
int numDevices = 0;
MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices));
CUmulticastObjectProp prop = {};
prop.size = size;
// This is a dummy value, it might affect the granularity in the future
prop.numDevices = numDevices;
prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
prop.flags = 0;
MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag));
return gran;
}
void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
AvoidCudaGraphCaptureGuard cgcGuard;
int deviceId = -1;
CUdevice currentDevice;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));
int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
int isFabricSupported;
MSCCLPP_CUTHROW(
cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDevice));
if (isFabricSupported) {
requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
}
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.requestedHandleTypes = (CUmemAllocationHandleType)(requestedHandleTypes);
prop.location.id = currentDevice;
if (gran == 0) {
gran = getMulticastGranularity(bytes, CU_MULTICAST_GRANULARITY_RECOMMENDED);
}
// allocate physical memory
CUmemGenericAllocationHandle memHandle;
size_t nbytes = (bytes + gran - 1) / gran * gran;
CUresult result = cuMemCreate(&memHandle, nbytes, &prop, 0);
if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC &&
(result == CUDA_ERROR_NOT_PERMITTED || result == CUDA_ERROR_NOT_SUPPORTED)) {
requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
prop.requestedHandleTypes = (CUmemAllocationHandleType)requestedHandleTypes;
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0));
} else {
MSCCLPP_CUTHROW(result);
}
if (align == 0) {
align = getMulticastGranularity(nbytes, CU_MULTICAST_GRANULARITY_MINIMUM);
}
void* devicePtr = nullptr;
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, align, 0U, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0));
setReadWriteMemoryAccess(devicePtr, nbytes);
auto stream = gpuStreamPool()->getStream();
MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
return devicePtr;
}
#endif // CUDA_NVLS_API_AVAILABLE
void gpuFree(void* ptr) {
AvoidCudaGraphCaptureGuard cgcGuard;
MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaFree(ptr));
}
void gpuFreeHost(void* ptr) {
AvoidCudaGraphCaptureGuard cgcGuard;
MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaFreeHost(ptr));
}
#if (CUDA_NVLS_API_AVAILABLE)
void gpuFreePhysical(void* ptr) {
AvoidCudaGraphCaptureGuard cgcGuard;
CUmemGenericAllocationHandle handle;
size_t size = 0;
MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemRetainAllocationHandle(&handle, ptr));
MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemRelease(handle));
MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
MSCCLPP_CUTHROW_IGNORE(cuMemUnmap((CUdeviceptr)ptr, size));
MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemRelease(handle));
MSCCLPP_CUTHROW_IGNORE(cuMemAddressFree((CUdeviceptr)ptr, size));
}
#endif // CUDA_NVLS_API_AVAILABLE
void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream, cudaMemcpyKind kind) {
AvoidCudaGraphCaptureGuard cgcGuard;
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, bytes, kind, stream));
}
void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) {
AvoidCudaGraphCaptureGuard cgcGuard;
CudaStreamWithFlags stream(cudaStreamNonBlocking);
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, bytes, kind, stream));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
}
} // namespace detail
bool isNvlsSupported() {
if (env()->forceDisableNvls) {
return false;
}
[[maybe_unused]] static bool result = false;
[[maybe_unused]] static bool isChecked = false;
#if (CUDA_NVLS_API_AVAILABLE)
if (!isChecked) {
int deviceId;
int isMulticastSupported;
CUdevice dev;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
MSCCLPP_CUTHROW(cuDeviceGet(&dev, deviceId));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
return isMulticastSupported == 1;
}
return result;
#endif
return false;
}
bool isCuMemMapAllocated([[maybe_unused]] void* ptr) {
#if defined(MSCCLPP_USE_ROCM)
return false;
#else
CUmemGenericAllocationHandle handle;
CUresult result = cuMemRetainAllocationHandle(&handle, ptr);
if (result != CUDA_SUCCESS) {
return false;
}
MSCCLPP_CUTHROW(cuMemRelease(handle));
if (!isNvlsSupported()) {
throw Error("cuMemMap is used in env without NVLS support", ErrorCode::InvalidUsage);
}
return true;
#endif
}
} // namespace mscclpp