// Copyright (c) Microsoft Corporation. // Licensed under the MIT license. #ifndef MSCCLPP_GPU_HPP_ #define MSCCLPP_GPU_HPP_ #include #if defined(MSCCLPP_DEVICE_HIP) using cudaError_t = hipError_t; using cudaEvent_t = hipEvent_t; using cudaGraph_t = hipGraph_t; using cudaGraphExec_t = hipGraphExec_t; using cudaDeviceProp = hipDeviceProp_t; using cudaStream_t = hipStream_t; using cudaStreamCaptureMode = hipStreamCaptureMode; using cudaMemcpyKind = hipMemcpyKind; using cudaIpcMemHandle_t = hipIpcMemHandle_t; using CUresult = hipError_t; using CUdeviceptr = hipDeviceptr_t; using CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t; using CUmemAllocationProp = hipMemAllocationProp; using CUmemAccessDesc = hipMemAccessDesc; using CUmemAllocationHandleType = hipMemAllocationHandleType; using CUmemAllocationGranularity_flags = hipMemAllocationGranularity_flags; using CUmemorytype = hipMemoryType; constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled; constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed; constexpr auto cudaErrorInvalidDevice = hipErrorInvalidDevice; constexpr auto cudaSuccess = hipSuccess; constexpr auto cudaErrorNotSupported = hipErrorNotSupported; constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking; constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal; constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed; constexpr auto cudaHostAllocMapped = hipHostMallocMapped; constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined; constexpr auto cudaMemcpyDefault = hipMemcpyDefault; constexpr auto cudaMemcpyDeviceToDevice = hipMemcpyDeviceToDevice; constexpr auto cudaMemcpyHostToDevice = hipMemcpyHostToDevice; constexpr auto cudaMemcpyDeviceToHost = hipMemcpyDeviceToHost; constexpr auto cudaIpcMemLazyEnablePeerAccess = hipIpcMemLazyEnablePeerAccess; constexpr auto cudaDevAttrComputeCapabilityMajor = hipDeviceAttributeComputeCapabilityMajor; constexpr auto cudaDevAttrComputeCapabilityMinor = hipDeviceAttributeComputeCapabilityMinor; constexpr auto CU_MEM_ALLOCATION_TYPE_PINNED = hipMemAllocationTypePinned; constexpr auto CU_MEM_LOCATION_TYPE_DEVICE = hipMemLocationTypeDevice; constexpr auto CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = hipMemHandleTypePosixFileDescriptor; constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWrite; constexpr auto CU_MEM_ALLOC_GRANULARITY_MINIMUM = hipMemAllocationGranularityMinimum; constexpr auto CU_MEMORYTYPE_DEVICE = hipMemoryTypeDevice; constexpr auto CU_POINTER_ATTRIBUTE_MEMORY_TYPE = HIP_POINTER_ATTRIBUTE_MEMORY_TYPE; constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL; #ifndef CUDA_SUCCESS #define CUDA_SUCCESS hipSuccess #endif // CUDA_SUCCESS #define CUDA_ERROR_DEINITIALIZED hipErrorDeinitialized #define CUDA_ERROR_CONTEXT_IS_DESTROYED hipErrorContextIsDestroyed #define CUDA_ERROR_LAUNCH_FAILED hipErrorLaunchFailure #define CUDA_ERROR_NOT_SUPPORTED hipErrorNotSupported #define CUDA_ERROR_INVALID_VALUE hipErrorInvalidValue #define cudaEventCreate(...) hipEventCreate(__VA_ARGS__) #define cudaEventCreateWithFlags(...) hipEventCreateWithFlags(__VA_ARGS__) #define cudaEventDestroy(...) hipEventDestroy(__VA_ARGS__) #define cudaEventRecord(...) hipEventRecord(__VA_ARGS__) #define cudaEventSynchronize(...) hipEventSynchronize(__VA_ARGS__) #define cudaEventElapsedTime(...) hipEventElapsedTime(__VA_ARGS__) #define cudaGetErrorString(...) hipGetErrorString(__VA_ARGS__) #define cudaGetDevice(...) hipGetDevice(__VA_ARGS__) #define cudaGetDeviceCount(...) hipGetDeviceCount(__VA_ARGS__) #define cudaGetDeviceProperties(...) hipGetDeviceProperties(__VA_ARGS__) #define cudaDeviceGetAttribute(...) hipDeviceGetAttribute(__VA_ARGS__) #define cudaGetLastError(...) hipGetLastError(__VA_ARGS__) #define cudaSetDevice(...) hipSetDevice(__VA_ARGS__) #define cudaDeviceSynchronize(...) hipDeviceSynchronize(__VA_ARGS__) #define cudaDeviceGetPCIBusId(...) hipDeviceGetPCIBusId(__VA_ARGS__) #define cudaDeviceCanAccessPeer(...) hipDeviceCanAccessPeer(__VA_ARGS__) #define cudaDeviceEnablePeerAccess(...) hipDeviceEnablePeerAccess(__VA_ARGS__) #define cudaHostAlloc(...) hipHostMalloc(__VA_ARGS__) #define cudaMalloc(...) hipMalloc(__VA_ARGS__) #define cudaFree(...) hipFree(__VA_ARGS__) #define cudaFreeHost(...) hipHostFree(__VA_ARGS__) #define cudaMemset(...) hipMemset(__VA_ARGS__) #define cudaMemsetAsync(...) hipMemsetAsync(__VA_ARGS__) #define cudaMemcpy(...) hipMemcpy(__VA_ARGS__) #define cudaMemcpyAsync(...) hipMemcpyAsync(__VA_ARGS__) #define cudaMemcpyToSymbol(...) hipMemcpyToSymbol(__VA_ARGS__) #define cudaMemcpyToSymbolAsync(...) hipMemcpyToSymbolAsync(__VA_ARGS__) #define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__) #define cudaStreamCreateWithFlags(...) hipStreamCreateWithFlags(__VA_ARGS__) #define cudaStreamSynchronize(...) hipStreamSynchronize(__VA_ARGS__) #define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__) #define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__) #define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__) #define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__) #define cudaGraphInstantiate(...) hipGraphInstantiate(__VA_ARGS__) #define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__) #define cudaGraphDestroy(...) hipGraphDestroy(__VA_ARGS__) #define cudaGraphExecDestroy(...) hipGraphExecDestroy(__VA_ARGS__) #define cudaThreadExchangeStreamCaptureMode(...) hipThreadExchangeStreamCaptureMode(__VA_ARGS__) #define cudaIpcGetMemHandle(...) hipIpcGetMemHandle(__VA_ARGS__) #define cudaIpcOpenMemHandle(...) hipIpcOpenMemHandle(__VA_ARGS__) #define cudaIpcCloseMemHandle(...) hipIpcCloseMemHandle(__VA_ARGS__) #define cuGetErrorString(...) hipDrvGetErrorString(__VA_ARGS__) #define cuMemAddressReserve(...) hipMemAddressReserve(__VA_ARGS__) #define cuMemAddressFree(...) hipMemAddressFree(__VA_ARGS__) #define cuMemGetAddressRange(...) hipMemGetAddressRange(__VA_ARGS__) #define cuMemCreate(...) hipMemCreate(__VA_ARGS__) #define cuMemRelease(...) hipMemRelease(__VA_ARGS__) #define cuMemSetAccess(...) hipMemSetAccess(__VA_ARGS__) #define cuMemMap(...) hipMemMap(__VA_ARGS__) #define cuMemUnmap(...) hipMemUnmap(__VA_ARGS__) #define cuMemRetainAllocationHandle(...) hipMemRetainAllocationHandle(__VA_ARGS__) #define cuMemExportToShareableHandle(...) hipMemExportToShareableHandle(__VA_ARGS__) #define cuMemImportFromShareableHandle(...) hipMemImportFromShareableHandle(__VA_ARGS__) #define cuMemGetAllocationGranularity(...) hipMemGetAllocationGranularity(__VA_ARGS__) #define cuPointerGetAttribute(...) hipPointerGetAttribute(__VA_ARGS__) #else // !defined(MSCCLPP_DEVICE_HIP) #include #include #endif // !defined(MSCCLPP_DEVICE_HIP) // NVLS #if !defined(MSCCLPP_DEVICE_HIP) #include #if CUDART_VERSION < 12030 #define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL) #endif // We need CUDA 12.3 above and kernel 5.6.0 above for NVLS API #define CUDA_NVLS_API_AVAILABLE ((CUDART_VERSION >= 12030) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0))) #else // defined(MSCCLPP_DEVICE_HIP) #define CUDA_NVLS_API_AVAILABLE 0 // NVLS is not supported on AMD platform, just to avoid compilation error #define CU_MEM_HANDLE_TYPE_FABRIC ((hipMemAllocationHandleType)0x8ULL) #endif // defined(MSCCLPP_DEVICE_HIP) // GPU sync threads #if defined(MSCCLPP_DEVICE_HIP) #define __syncshm() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier"); #else #define __syncshm() __syncthreads(); #endif #endif // MSCCLPP_GPU_HPP_