mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 01:36:10 +00:00
Add `GpuIpcMemHandle`, a generic GPU memory handle that covers all existing methods for GPU memory mapping. This PR fixes cases where the importing side failed to properly fall back to a type of memory handle that is feasible in the importing environment. It also consolidates the code for creating or destroying various memory handles into a single RAII wrapper. --------- Co-authored-by: Binyang Li <binyli@microsoft.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: Binyang2014 <9415966+Binyang2014@users.noreply.github.com>
155 lines
7.5 KiB
C++
155 lines
7.5 KiB
C++
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
|
|
|
|
#ifndef MSCCLPP_GPU_HPP_
|
|
#define MSCCLPP_GPU_HPP_
|
|
|
|
#include <mscclpp/device.hpp>
|
|
|
|
// CUDA-to-HIP compatibility layer.
//
// When MSCCLPP_DEVICE_HIP is defined, the CUDA runtime (cuda*) and driver (cu*/CU*) names listed
// below are mapped onto HIP names, so code written against the CUDA spelling can be compiled
// against HIP. Otherwise the CUDA headers are included and the names keep their native meaning.
//
// NOTE(review): this branch includes no HIP header itself; it presumably relies on
// <mscclpp/device.hpp> to make the hip* declarations visible -- confirm.
#if defined(MSCCLPP_DEVICE_HIP)

// Runtime API types.
using cudaError_t = hipError_t;
using cudaEvent_t = hipEvent_t;
using cudaGraph_t = hipGraph_t;
using cudaGraphExec_t = hipGraphExec_t;
using cudaDeviceProp = hipDeviceProp_t;
using cudaStream_t = hipStream_t;
using cudaStreamCaptureMode = hipStreamCaptureMode;
using cudaMemcpyKind = hipMemcpyKind;
using cudaIpcMemHandle_t = hipIpcMemHandle_t;

// Driver API types. Note that CUresult and cudaError_t both alias hipError_t.
using CUresult = hipError_t;
using CUdeviceptr = hipDeviceptr_t;
using CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t;
using CUmemAllocationProp = hipMemAllocationProp;
using CUmemAccessDesc = hipMemAccessDesc;
using CUmemAllocationHandleType = hipMemAllocationHandleType;
using CUmemAllocationGranularity_flags = hipMemAllocationGranularity_flags;
using CUmemorytype = hipMemoryType;

// Runtime API constants (error codes, stream/host-alloc flags, memcpy kinds, IPC flags).
constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled;
constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed;
constexpr auto cudaErrorInvalidDevice = hipErrorInvalidDevice;
constexpr auto cudaSuccess = hipSuccess;
constexpr auto cudaErrorNotSupported = hipErrorNotSupported;
constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal;
constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed;
constexpr auto cudaHostAllocMapped = hipHostMallocMapped;
constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined;
constexpr auto cudaMemcpyDefault = hipMemcpyDefault;
constexpr auto cudaMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
constexpr auto cudaMemcpyHostToDevice = hipMemcpyHostToDevice;
constexpr auto cudaMemcpyDeviceToHost = hipMemcpyDeviceToHost;
constexpr auto cudaIpcMemLazyEnablePeerAccess = hipIpcMemLazyEnablePeerAccess;

// Device attribute selectors.
constexpr auto cudaDevAttrComputeCapabilityMajor = hipDeviceAttributeComputeCapabilityMajor;
constexpr auto cudaDevAttrComputeCapabilityMinor = hipDeviceAttributeComputeCapabilityMinor;

// Driver API constants for memory allocation, access and memory type.
constexpr auto CU_MEM_ALLOCATION_TYPE_PINNED = hipMemAllocationTypePinned;
constexpr auto CU_MEM_LOCATION_TYPE_DEVICE = hipMemLocationTypeDevice;
constexpr auto CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = hipMemHandleTypePosixFileDescriptor;
constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWrite;
constexpr auto CU_MEM_ALLOC_GRANULARITY_MINIMUM = hipMemAllocationGranularityMinimum;
constexpr auto CU_MEMORYTYPE_DEVICE = hipMemoryTypeDevice;

// Driver API pointer attribute selectors.
constexpr auto CU_POINTER_ATTRIBUTE_MEMORY_TYPE = HIP_POINTER_ATTRIBUTE_MEMORY_TYPE;
constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL;

// Driver API result codes, provided as object-like macros.
// CUDA_SUCCESS is the only one guarded against an earlier definition.
#ifndef CUDA_SUCCESS
#define CUDA_SUCCESS hipSuccess
#endif  // CUDA_SUCCESS
#define CUDA_ERROR_DEINITIALIZED hipErrorDeinitialized
#define CUDA_ERROR_CONTEXT_IS_DESTROYED hipErrorContextIsDestroyed
#define CUDA_ERROR_LAUNCH_FAILED hipErrorLaunchFailure
#define CUDA_ERROR_NOT_SUPPORTED hipErrorNotSupported
#define CUDA_ERROR_INVALID_VALUE hipErrorInvalidValue

// Runtime API functions. Each macro forwards its arguments unchanged to the HIP function.
// Most names differ only in the prefix; exceptions are cudaHostAlloc -> hipHostMalloc and
// cudaFreeHost -> hipHostFree.
#define cudaEventCreate(...) hipEventCreate(__VA_ARGS__)
#define cudaEventCreateWithFlags(...) hipEventCreateWithFlags(__VA_ARGS__)
#define cudaEventDestroy(...) hipEventDestroy(__VA_ARGS__)
#define cudaEventRecord(...) hipEventRecord(__VA_ARGS__)
#define cudaEventSynchronize(...) hipEventSynchronize(__VA_ARGS__)
#define cudaEventElapsedTime(...) hipEventElapsedTime(__VA_ARGS__)
#define cudaGetErrorString(...) hipGetErrorString(__VA_ARGS__)
#define cudaGetDevice(...) hipGetDevice(__VA_ARGS__)
#define cudaGetDeviceCount(...) hipGetDeviceCount(__VA_ARGS__)
#define cudaGetDeviceProperties(...) hipGetDeviceProperties(__VA_ARGS__)
#define cudaDeviceGetAttribute(...) hipDeviceGetAttribute(__VA_ARGS__)
#define cudaGetLastError(...) hipGetLastError(__VA_ARGS__)
#define cudaSetDevice(...) hipSetDevice(__VA_ARGS__)
#define cudaDeviceSynchronize(...) hipDeviceSynchronize(__VA_ARGS__)
#define cudaDeviceGetPCIBusId(...) hipDeviceGetPCIBusId(__VA_ARGS__)
#define cudaDeviceCanAccessPeer(...) hipDeviceCanAccessPeer(__VA_ARGS__)
#define cudaDeviceEnablePeerAccess(...) hipDeviceEnablePeerAccess(__VA_ARGS__)
#define cudaHostAlloc(...) hipHostMalloc(__VA_ARGS__)
#define cudaMalloc(...) hipMalloc(__VA_ARGS__)
#define cudaFree(...) hipFree(__VA_ARGS__)
#define cudaFreeHost(...) hipHostFree(__VA_ARGS__)
#define cudaMemset(...) hipMemset(__VA_ARGS__)
#define cudaMemsetAsync(...) hipMemsetAsync(__VA_ARGS__)
#define cudaMemcpy(...) hipMemcpy(__VA_ARGS__)
#define cudaMemcpyAsync(...) hipMemcpyAsync(__VA_ARGS__)
#define cudaMemcpyToSymbol(...) hipMemcpyToSymbol(__VA_ARGS__)
#define cudaMemcpyToSymbolAsync(...) hipMemcpyToSymbolAsync(__VA_ARGS__)
#define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__)
#define cudaStreamCreateWithFlags(...) hipStreamCreateWithFlags(__VA_ARGS__)
#define cudaStreamSynchronize(...) hipStreamSynchronize(__VA_ARGS__)
#define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__)
#define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__)
#define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__)
#define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__)
#define cudaGraphInstantiate(...) hipGraphInstantiate(__VA_ARGS__)
#define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__)
#define cudaGraphDestroy(...) hipGraphDestroy(__VA_ARGS__)
#define cudaGraphExecDestroy(...) hipGraphExecDestroy(__VA_ARGS__)
#define cudaThreadExchangeStreamCaptureMode(...) hipThreadExchangeStreamCaptureMode(__VA_ARGS__)
#define cudaIpcGetMemHandle(...) hipIpcGetMemHandle(__VA_ARGS__)
#define cudaIpcOpenMemHandle(...) hipIpcOpenMemHandle(__VA_ARGS__)
#define cudaIpcCloseMemHandle(...) hipIpcCloseMemHandle(__VA_ARGS__)

// Driver API functions, forwarded the same way.
// cuGetErrorString maps to hipDrvGetErrorString rather than hipGetErrorString.
#define cuGetErrorString(...) hipDrvGetErrorString(__VA_ARGS__)
#define cuMemAddressReserve(...) hipMemAddressReserve(__VA_ARGS__)
#define cuMemAddressFree(...) hipMemAddressFree(__VA_ARGS__)
#define cuMemGetAddressRange(...) hipMemGetAddressRange(__VA_ARGS__)
#define cuMemCreate(...) hipMemCreate(__VA_ARGS__)
#define cuMemRelease(...) hipMemRelease(__VA_ARGS__)
#define cuMemSetAccess(...) hipMemSetAccess(__VA_ARGS__)
#define cuMemMap(...) hipMemMap(__VA_ARGS__)
#define cuMemUnmap(...) hipMemUnmap(__VA_ARGS__)
#define cuMemRetainAllocationHandle(...) hipMemRetainAllocationHandle(__VA_ARGS__)
#define cuMemExportToShareableHandle(...) hipMemExportToShareableHandle(__VA_ARGS__)
#define cuMemImportFromShareableHandle(...) hipMemImportFromShareableHandle(__VA_ARGS__)
#define cuMemGetAllocationGranularity(...) hipMemGetAllocationGranularity(__VA_ARGS__)
#define cuPointerGetAttribute(...) hipPointerGetAttribute(__VA_ARGS__)

#else  // !defined(MSCCLPP_DEVICE_HIP)

// Native CUDA build: use the driver and runtime headers directly.
#include <cuda.h>
#include <cuda_runtime.h>

#endif  // !defined(MSCCLPP_DEVICE_HIP)
|
|
|
|
// NVLS
//
// Defines CUDA_NVLS_API_AVAILABLE (usable in #if) and makes sure CU_MEM_HANDLE_TYPE_FABRIC can be
// named on every supported configuration.
#if !defined(MSCCLPP_DEVICE_HIP)
#include <linux/version.h>  // LINUX_VERSION_CODE, KERNEL_VERSION
#if CUDART_VERSION < 12030
// Fallback for CUDA runtimes older than 12.3.
// NOTE(review): the value 0x8 presumably mirrors the enumerator declared by newer <cuda.h> -- confirm.
#define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL)
#endif
// The NVLS API requires CUDA 12.3 or newer and Linux kernel 5.6.0 or newer.
#define CUDA_NVLS_API_AVAILABLE ((CUDART_VERSION >= 12030) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
#else  // defined(MSCCLPP_DEVICE_HIP)
// NVLS is not supported on the AMD platform.
#define CUDA_NVLS_API_AVAILABLE 0
// Defined only so that code naming the fabric handle type still compiles on the AMD platform.
#define CU_MEM_HANDLE_TYPE_FABRIC ((hipMemAllocationHandleType)0x8ULL)
#endif  // defined(MSCCLPP_DEVICE_HIP)
|
|
|
|
// GPU sync threads
//
// __syncshm() expands to a complete barrier statement for device code:
//   - HIP:  inline assembly issuing `s_waitcnt lgkmcnt(0)` followed by `s_barrier`
//           (NOTE(review): presumably a barrier that only waits for shared-memory (LDS) traffic
//           rather than all memory operations -- confirm against the AMD ISA documentation);
//   - CUDA: a plain `__syncthreads()` call.
//
// NOTE(review): both expansions already end with ';'. Writing `__syncshm();` therefore produces an
// extra empty statement, which breaks an unbraced `if (...) __syncshm(); else ...`. The semicolon is
// kept here because call sites that omit it may exist outside this header.
#if defined(MSCCLPP_DEVICE_HIP)
#define __syncshm() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
#else
#define __syncshm() __syncthreads();
#endif
|
|
|
|
#endif // MSCCLPP_GPU_HPP_
|