mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
This PR refactors the algorithm selection logic in MSCCL++ and introduces support for symmetric memory configuration through environment variables. 1. Algorithm selection refactoring: use a separate class for algorithm selection. This could later incorporate more complex selection logic based on message size, architecture, whether a CUDA graph is enabled, and the memory allocation method. 2. Symmetric memory support: introduce a symmetricMemory parameter in algorithm context key generation, and remove the disableChannelCache environment variable, as it is ambiguous. 3. New arguments for build_default_algorithms: add flag_buffer and flag_buffer_size arguments so that a unified flag buffer can be shared across algorithms, avoiding application hangs when switching algorithms for different message sizes. --------- Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com> Co-authored-by: Qinghua Zhou <qinghuazhou@microsoft.com> Co-authored-by: Caio Rocha <caiorocha@microsoft.com>
160 lines
7.8 KiB
C++
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_GPU_HPP_
#define MSCCLPP_GPU_HPP_

#include <mscclpp/device.hpp>

#if defined(MSCCLPP_DEVICE_HIP)

// CUDA-to-HIP compatibility layer: alias the CUDA runtime/driver API names to
// their HIP equivalents so shared MSCCL++ code can be written once against the
// CUDA spellings and compile unmodified on AMD platforms.

// Runtime API opaque types and enums.
using cudaError_t = hipError_t;
using cudaEvent_t = hipEvent_t;
using cudaGraph_t = hipGraph_t;
using cudaGraphExec_t = hipGraphExec_t;
using cudaDeviceProp = hipDeviceProp_t;
using cudaStream_t = hipStream_t;
using cudaStreamCaptureMode = hipStreamCaptureMode;
using cudaStreamCaptureStatus = hipStreamCaptureStatus;
using cudaMemcpyKind = hipMemcpyKind;
using cudaIpcMemHandle_t = hipIpcMemHandle_t;

// Driver API types (virtual memory management and pointer queries).
using CUresult = hipError_t;
using CUdeviceptr = hipDeviceptr_t;
using CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t;
using CUmemAllocationProp = hipMemAllocationProp;
using CUmemAccessDesc = hipMemAccessDesc;
using CUmemAllocationHandleType = hipMemAllocationHandleType;
using CUmemAllocationGranularity_flags = hipMemAllocationGranularity_flags;
using CUmemorytype = hipMemoryType;

// Runtime API enumerator values.
constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled;
constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed;
constexpr auto cudaErrorInvalidDevice = hipErrorInvalidDevice;
constexpr auto cudaSuccess = hipSuccess;
constexpr auto cudaErrorNotSupported = hipErrorNotSupported;
constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal;
constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed;
constexpr auto cudaStreamCaptureStatusNone = hipStreamCaptureStatusNone;
constexpr auto cudaStreamCaptureStatusActive = hipStreamCaptureStatusActive;
constexpr auto cudaStreamCaptureStatusInvalidated = hipStreamCaptureStatusInvalidated;
constexpr auto cudaHostAllocMapped = hipHostMallocMapped;
constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined;
constexpr auto cudaMemcpyDefault = hipMemcpyDefault;
constexpr auto cudaMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
constexpr auto cudaMemcpyHostToDevice = hipMemcpyHostToDevice;
constexpr auto cudaMemcpyDeviceToHost = hipMemcpyDeviceToHost;
constexpr auto cudaIpcMemLazyEnablePeerAccess = hipIpcMemLazyEnablePeerAccess;

constexpr auto cudaDevAttrComputeCapabilityMajor = hipDeviceAttributeComputeCapabilityMajor;
constexpr auto cudaDevAttrComputeCapabilityMinor = hipDeviceAttributeComputeCapabilityMinor;

// Driver API enumerator values.
constexpr auto CU_MEM_ALLOCATION_TYPE_PINNED = hipMemAllocationTypePinned;
constexpr auto CU_MEM_LOCATION_TYPE_DEVICE = hipMemLocationTypeDevice;
constexpr auto CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = hipMemHandleTypePosixFileDescriptor;
constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWrite;
constexpr auto CU_MEM_ALLOC_GRANULARITY_MINIMUM = hipMemAllocationGranularityMinimum;
constexpr auto CU_MEMORYTYPE_DEVICE = hipMemoryTypeDevice;

constexpr auto CU_POINTER_ATTRIBUTE_MEMORY_TYPE = HIP_POINTER_ATTRIBUTE_MEMORY_TYPE;
constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL;

// Driver-style status codes. CUDA_SUCCESS is guarded to avoid redefinition if
// it is already defined by a previously included header.
#ifndef CUDA_SUCCESS
#define CUDA_SUCCESS hipSuccess
#endif  // CUDA_SUCCESS
#define CUDA_ERROR_DEINITIALIZED hipErrorDeinitialized
#define CUDA_ERROR_CONTEXT_IS_DESTROYED hipErrorContextIsDestroyed
#define CUDA_ERROR_LAUNCH_FAILED hipErrorLaunchFailure
#define CUDA_ERROR_NOT_SUPPORTED hipErrorNotSupported
#define CUDA_ERROR_INVALID_VALUE hipErrorInvalidValue

// Runtime API function mappings. These are macros (not inline wrappers) so
// argument lists forward verbatim to the HIP functions.
#define cudaEventCreate(...) hipEventCreate(__VA_ARGS__)
#define cudaEventCreateWithFlags(...) hipEventCreateWithFlags(__VA_ARGS__)
#define cudaEventDestroy(...) hipEventDestroy(__VA_ARGS__)
#define cudaEventRecord(...) hipEventRecord(__VA_ARGS__)
#define cudaEventSynchronize(...) hipEventSynchronize(__VA_ARGS__)
#define cudaEventElapsedTime(...) hipEventElapsedTime(__VA_ARGS__)
#define cudaGetErrorString(...) hipGetErrorString(__VA_ARGS__)
#define cudaGetDevice(...) hipGetDevice(__VA_ARGS__)
#define cudaGetDeviceCount(...) hipGetDeviceCount(__VA_ARGS__)
#define cudaGetDeviceProperties(...) hipGetDeviceProperties(__VA_ARGS__)
#define cudaDeviceGetAttribute(...) hipDeviceGetAttribute(__VA_ARGS__)
#define cudaGetLastError(...) hipGetLastError(__VA_ARGS__)
#define cudaSetDevice(...) hipSetDevice(__VA_ARGS__)
#define cudaDeviceSynchronize(...) hipDeviceSynchronize(__VA_ARGS__)
#define cudaDeviceGetPCIBusId(...) hipDeviceGetPCIBusId(__VA_ARGS__)
#define cudaDeviceCanAccessPeer(...) hipDeviceCanAccessPeer(__VA_ARGS__)
#define cudaDeviceEnablePeerAccess(...) hipDeviceEnablePeerAccess(__VA_ARGS__)
#define cudaHostAlloc(...) hipHostMalloc(__VA_ARGS__)
#define cudaMalloc(...) hipMalloc(__VA_ARGS__)
#define cudaFree(...) hipFree(__VA_ARGS__)
#define cudaFreeHost(...) hipHostFree(__VA_ARGS__)
#define cudaMemset(...) hipMemset(__VA_ARGS__)
#define cudaMemsetAsync(...) hipMemsetAsync(__VA_ARGS__)
#define cudaMemcpy(...) hipMemcpy(__VA_ARGS__)
#define cudaMemcpyAsync(...) hipMemcpyAsync(__VA_ARGS__)
#define cudaMemcpyToSymbol(...) hipMemcpyToSymbol(__VA_ARGS__)
#define cudaMemcpyToSymbolAsync(...) hipMemcpyToSymbolAsync(__VA_ARGS__)
#define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__)
#define cudaStreamCreateWithFlags(...) hipStreamCreateWithFlags(__VA_ARGS__)
#define cudaStreamSynchronize(...) hipStreamSynchronize(__VA_ARGS__)
#define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__)
#define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__)
#define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__)
#define cudaStreamIsCapturing(...) hipStreamIsCapturing(__VA_ARGS__)
#define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__)
#define cudaGraphInstantiate(...) hipGraphInstantiate(__VA_ARGS__)
#define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__)
#define cudaGraphDestroy(...) hipGraphDestroy(__VA_ARGS__)
#define cudaGraphExecDestroy(...) hipGraphExecDestroy(__VA_ARGS__)
#define cudaThreadExchangeStreamCaptureMode(...) hipThreadExchangeStreamCaptureMode(__VA_ARGS__)
#define cudaIpcGetMemHandle(...) hipIpcGetMemHandle(__VA_ARGS__)
#define cudaIpcOpenMemHandle(...) hipIpcOpenMemHandle(__VA_ARGS__)
#define cudaIpcCloseMemHandle(...) hipIpcCloseMemHandle(__VA_ARGS__)

// Driver API function mappings (virtual memory management and pointer queries).
#define cuGetErrorString(...) hipDrvGetErrorString(__VA_ARGS__)
#define cuMemAddressReserve(...) hipMemAddressReserve(__VA_ARGS__)
#define cuMemAddressFree(...) hipMemAddressFree(__VA_ARGS__)
#define cuMemGetAddressRange(...) hipMemGetAddressRange(__VA_ARGS__)
#define cuMemCreate(...) hipMemCreate(__VA_ARGS__)
#define cuMemRelease(...) hipMemRelease(__VA_ARGS__)
#define cuMemSetAccess(...) hipMemSetAccess(__VA_ARGS__)
#define cuMemMap(...) hipMemMap(__VA_ARGS__)
#define cuMemUnmap(...) hipMemUnmap(__VA_ARGS__)
#define cuMemRetainAllocationHandle(...) hipMemRetainAllocationHandle(__VA_ARGS__)
#define cuMemExportToShareableHandle(...) hipMemExportToShareableHandle(__VA_ARGS__)
#define cuMemImportFromShareableHandle(...) hipMemImportFromShareableHandle(__VA_ARGS__)
#define cuMemGetAllocationGranularity(...) hipMemGetAllocationGranularity(__VA_ARGS__)
#define cuPointerGetAttribute(...) hipPointerGetAttribute(__VA_ARGS__)

#else  // !defined(MSCCLPP_DEVICE_HIP)

// On NVIDIA platforms, use the CUDA driver and runtime headers directly.
#include <cuda.h>
#include <cuda_runtime.h>

#endif  // !defined(MSCCLPP_DEVICE_HIP)

// NVLS (NVLink SHARP multicast) feature detection.
#if !defined(MSCCLPP_DEVICE_HIP)
#include <linux/version.h>
#if CUDART_VERSION < 12030
// CU_MEM_HANDLE_TYPE_FABRIC is only declared by CUDA 12.3+ headers; supply its
// raw enum value on older toolkits so code referencing it still compiles.
#define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL)
#endif
// We need CUDA 12.3 above and kernel 5.6.0 above for NVLS API
#define CUDA_NVLS_API_AVAILABLE ((CUDART_VERSION >= 12030) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
#else  // defined(MSCCLPP_DEVICE_HIP)
#define CUDA_NVLS_API_AVAILABLE 0
// NVLS is not supported on AMD platform, just to avoid compilation error
#define CU_MEM_HANDLE_TYPE_FABRIC ((hipMemAllocationHandleType)0x8ULL)
#endif  // defined(MSCCLPP_DEVICE_HIP)

// GPU sync threads: block-wide barrier ordered with shared-memory traffic.
#if defined(MSCCLPP_DEVICE_HIP)
// On AMD, wait for outstanding LDS (shared-memory) operations to drain
// (s_waitcnt lgkmcnt(0)) before hitting the workgroup barrier.
#define __syncshm() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
#else
// On NVIDIA, __syncthreads() already provides this ordering.
#define __syncshm() __syncthreads();
#endif

#endif  // MSCCLPP_GPU_HPP_
|