mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-20 06:49:29 +00:00
NVLS support for msccl++ executor (#375)
- Support mote datatype for multicast operation - Add new OP MULTI_LOAD_REDUCE_STORE to support NVLS - Modify allocSharedPhysicalCuda, which return std::shared_ptr<T> instead of std::shared_ptr<PhysicalCudaMemory> - Add Python support for allocSharedPhysicalCuda Test passed for `allreduce_nvls.json`
This commit is contained in:
@@ -7,12 +7,12 @@
|
||||
#include <mscclpp/concurrency_device.hpp>
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/gpu.hpp>
|
||||
#include <mscclpp/gpu_data_types.hpp>
|
||||
#include <mscclpp/packet_device.hpp>
|
||||
#include <mscclpp/sm_channel.hpp>
|
||||
#include <mscclpp/sm_channel_device.hpp>
|
||||
|
||||
#include "common.hpp"
|
||||
#include "gpu_data_types.hpp"
|
||||
|
||||
template <typename To, typename From>
|
||||
__forceinline__ __device__ To bit_cast(const From& src) {
|
||||
|
||||
@@ -9,7 +9,7 @@ baseImageTable=(
|
||||
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
|
||||
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
|
||||
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
|
||||
["rocm6.2"]="rocm/rocm-terminal:6.2"
|
||||
["rocm6.2"]="rocm/rocm-terminal:6.2.1"
|
||||
)
|
||||
|
||||
declare -A extraLdPathTable
|
||||
|
||||
@@ -19,6 +19,10 @@
|
||||
```
|
||||
lsmod | grep nvidia_peermem
|
||||
```
|
||||
* For GPU with nvls support, the IMEX channels should be set up (refer [cuMemCreate](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c)). You can set up the channels manually via:
|
||||
```
|
||||
sudo nvidia-modprobe -s -i <start:number of minors>
|
||||
```
|
||||
|
||||
## Build with Docker Images
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ using CUdeviceptr = hipDeviceptr_t;
|
||||
using CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t;
|
||||
using CUmemAllocationProp = hipMemAllocationProp;
|
||||
using CUmemAccessDesc = hipMemAccessDesc;
|
||||
using CUmemAllocationHandleType = hipMemAllocationHandleType;
|
||||
|
||||
constexpr auto cudaSuccess = hipSuccess;
|
||||
constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
|
||||
@@ -86,6 +87,9 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
|
||||
#define cuMemSetAccess(...) hipMemSetAccess(__VA_ARGS__)
|
||||
#define cuMemMap(...) hipMemMap(__VA_ARGS__)
|
||||
#define cuMemUnmap(...) hipMemUnmap(__VA_ARGS__)
|
||||
#define cuMemRetainAllocationHandle(...) hipMemRetainAllocationHandle(__VA_ARGS__)
|
||||
#define cuMemExportToShareableHandle(...) hipMemExportToShareableHandle(__VA_ARGS__)
|
||||
#define cuMemImportFromShareableHandle(...) hipMemImportFromShareableHandle(__VA_ARGS__)
|
||||
|
||||
#else
|
||||
|
||||
@@ -97,9 +101,10 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
|
||||
// NVLS
|
||||
#if !defined(__HIP_PLATFORM_AMD__)
|
||||
#include <linux/version.h>
|
||||
#define USE_NVLS ((CUDART_VERSION >= 12010) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
|
||||
// We need CU_MEM_HANDLE_TYPE_FABRIC (instroduced in cuda12.3) to support sharing handles across GPUs via sockets
|
||||
#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12030) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
|
||||
#else // !defined(__HIP_PLATFORM_AMD__)
|
||||
#define USE_NVLS 0
|
||||
#define CUDA_NVLS_SUPPORTED 0
|
||||
#endif // !defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
// GPU sync threads
|
||||
|
||||
@@ -16,6 +16,7 @@ using __bfloat162 = __hip_bfloat162;
|
||||
#else
|
||||
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#if (CUDART_VERSION >= 11000)
|
||||
#include <cuda_bf16.h>
|
||||
#endif
|
||||
@@ -9,6 +9,7 @@
|
||||
|
||||
#include "errors.hpp"
|
||||
#include "gpu.hpp"
|
||||
#include "utils.hpp"
|
||||
|
||||
/// Throw @ref mscclpp::CudaError if @p cmd does not return cudaSuccess.
|
||||
/// @param cmd The command to execute.
|
||||
@@ -34,6 +35,19 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
/// set memory access permission to read-write
|
||||
/// @param base Base memory pointer.
|
||||
/// @param size Size of the memory.
|
||||
inline void setReadWriteMemoryAccess(void* base, size_t size) {
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
int deviceId;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
accessDesc.location.id = deviceId;
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, size, &accessDesc, 1));
|
||||
}
|
||||
|
||||
/// A RAII guard that will cudaThreadExchangeStreamCaptureMode to cudaStreamCaptureModeRelaxed on construction and
|
||||
/// restore the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture.
|
||||
struct AvoidCudaGraphCaptureGuard {
|
||||
@@ -53,15 +67,6 @@ struct CudaStreamWithFlags {
|
||||
template <class T>
|
||||
struct CudaDeleter;
|
||||
|
||||
template <class T>
|
||||
struct PhysicalCudaMemory {
|
||||
CUmemGenericAllocationHandle memHandle_;
|
||||
T* devicePtr_;
|
||||
size_t size_;
|
||||
PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t size)
|
||||
: memHandle_(memHandle), devicePtr_(devicePtr), size_(size) {}
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// A wrapper of cudaMalloc that sets the allocated memory to zero.
|
||||
@@ -79,46 +84,38 @@ T* cudaCalloc(size_t nelem) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
template <class T>
|
||||
PhysicalCudaMemory<T>* cudaPhysicalCalloc(size_t nelem, size_t gran) {
|
||||
T* cudaPhysicalCalloc(size_t nelems, size_t gran) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
|
||||
int deviceId = -1;
|
||||
CUdevice currentDevice;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
MSCCLPP_CUTHROW(cuDeviceGet(¤tDevice, deviceId));
|
||||
|
||||
CUmemAllocationProp prop = {};
|
||||
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.location.id = deviceId;
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
// TODO: revisit when HIP fixes this typo in the field name
|
||||
prop.requestedHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
#else
|
||||
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
#endif
|
||||
prop.requestedHandleTypes =
|
||||
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
|
||||
prop.location.id = currentDevice;
|
||||
|
||||
CUmemGenericAllocationHandle memHandle;
|
||||
size_t bufferSize = sizeof(T) * nelem;
|
||||
// allocate physical memory
|
||||
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, bufferSize, &prop, 0 /*flags*/));
|
||||
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
accessDesc.location.id = deviceId;
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
CUmemGenericAllocationHandle memHandle;
|
||||
size_t nbytes = (nelems * sizeof(T) + gran - 1) / gran * gran;
|
||||
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));
|
||||
|
||||
T* devicePtr = nullptr;
|
||||
// Map the device pointer
|
||||
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0));
|
||||
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0));
|
||||
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, bufferSize, &accessDesc, 1));
|
||||
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, gran, 0U, 0));
|
||||
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0));
|
||||
setReadWriteMemoryAccess(devicePtr, nbytes);
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, bufferSize, stream));
|
||||
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
|
||||
return new PhysicalCudaMemory<T>(memHandle, devicePtr, bufferSize);
|
||||
return devicePtr;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
T* cudaExtCalloc(size_t nelem) {
|
||||
@@ -206,11 +203,15 @@ struct CudaDeleter {
|
||||
template <class T>
|
||||
struct CudaPhysicalDeleter {
|
||||
static_assert(!std::is_array_v<T>, "T must not be an array");
|
||||
void operator()(PhysicalCudaMemory<T>* ptr) {
|
||||
void operator()(T* ptr) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr->devicePtr_, ptr->size_));
|
||||
MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr->devicePtr_, ptr->size_));
|
||||
MSCCLPP_CUTHROW(cuMemRelease(ptr->memHandle_));
|
||||
CUmemGenericAllocationHandle handle;
|
||||
size_t size = 0;
|
||||
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, ptr));
|
||||
MSCCLPP_CUTHROW(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
|
||||
MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, size));
|
||||
MSCCLPP_CUTHROW(cuMemRelease(handle));
|
||||
MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, size));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -234,16 +235,46 @@ std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
|
||||
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
|
||||
}
|
||||
|
||||
/// Allocated physical memory on the device and returns a memory handle along with a memory handle for it.
|
||||
/// The deallocation only happens PhysicalCudaMemory goes out of scope.
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
static inline size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
|
||||
size_t gran = 0;
|
||||
int numDevices = 0;
|
||||
MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices));
|
||||
|
||||
CUmulticastObjectProp prop = {};
|
||||
prop.size = size;
|
||||
// This is a dummy value, it might affect the granularity in the future
|
||||
prop.numDevices = numDevices;
|
||||
prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
|
||||
prop.flags = 0;
|
||||
MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag));
|
||||
return gran;
|
||||
}
|
||||
#endif
|
||||
|
||||
/// Allocates physical memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @param gran the granularity of the allocation.
|
||||
/// @return A std::shared_ptr to the memory handle and a device pointer for that memory.
|
||||
/// @return A std::shared_ptr to the allocated memory.
|
||||
template <class T>
|
||||
std::shared_ptr<PhysicalCudaMemory<T>> allocSharedPhysicalCuda(size_t count, size_t gran) {
|
||||
return detail::safeAlloc<PhysicalCudaMemory<T>, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>,
|
||||
std::shared_ptr<PhysicalCudaMemory<T>>>(count, gran);
|
||||
std::shared_ptr<T> allocSharedPhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
if (!isNvlsSupported()) {
|
||||
throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
|
||||
}
|
||||
if (count == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (gran == 0) {
|
||||
gran = getMulticastGranularity(count * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
|
||||
}
|
||||
size_t nelems = ((count * sizeof(T) + gran - 1) / gran * gran) / sizeof(T);
|
||||
return detail::safeAlloc<T, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>, std::shared_ptr<T>>(nelems, gran);
|
||||
#else
|
||||
throw Error("Only support GPU with Fabric support", ErrorCode::InvalidUsage);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
|
||||
@@ -269,18 +300,6 @@ UniqueCudaPtr<T> allocUniqueCuda(size_t count = 1) {
|
||||
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
|
||||
}
|
||||
|
||||
/// Allocated physical memory on the device and returns a memory handle along with a virtual memory handle for it.
|
||||
/// The memory is zeroed out.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @param gran the granularity of the allocation.
|
||||
/// @return A std::unique_ptr to the memory handle and a device pointer for that memory.
|
||||
template <class T>
|
||||
std::unique_ptr<PhysicalCudaMemory<T>> allocUniquePhysicalCuda(size_t count, size_t gran) {
|
||||
return detail::safeAlloc<PhysicalCudaMemory<T>, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>,
|
||||
std::unique_ptr<CudaPhysicalDeleter<T>, CudaDeleter<CudaPhysicalDeleter<T>>>>(count, gran);
|
||||
}
|
||||
|
||||
/// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param count Number of elements to allocate.
|
||||
@@ -349,6 +368,32 @@ UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// Allocated physical memory on the device and returns a memory handle along with a virtual memory handle for it.
|
||||
/// The memory is zeroed out.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param count Number of elements to allocate.
|
||||
/// @param gran the granularity of the allocation.
|
||||
/// @return A std::unique_ptr to the allocated memory.
|
||||
template <class T>
|
||||
std::unique_ptr<T> allocUniquePhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
if (!isNvlsSupported()) {
|
||||
throw Error("Only suupport GPU with NVLS support", ErrorCode::InvalidUsage);
|
||||
}
|
||||
if (count == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (gran == 0) {
|
||||
gran = getMulticastGranularity(count * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
|
||||
}
|
||||
return detail::safeAlloc<T, detail::cudaPhysicalCalloc<T>, CudaPhysicalDeleter<T>,
|
||||
std::unique_ptr<CudaPhysicalDeleter<T>, CudaDeleter<CudaPhysicalDeleter<T>>>>(count, gran);
|
||||
#else
|
||||
throw Error("Only support GPU with Fabric support", ErrorCode::InvalidUsage);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Asynchronous cudaMemcpy without capture into a CUDA graph.
|
||||
/// @tparam T Type of each element in the allocated memory.
|
||||
/// @param dst Destination pointer.
|
||||
|
||||
@@ -25,26 +25,26 @@ class NvlsConnection {
|
||||
|
||||
struct DeviceMulticastPointer {
|
||||
private:
|
||||
std::shared_ptr<PhysicalCudaMemory<char>> deviceMem_;
|
||||
void* devicePtr_;
|
||||
std::shared_ptr<char> mcPtr_;
|
||||
size_t bufferSize_;
|
||||
|
||||
public:
|
||||
using DeviceHandle = DeviceMulticastPointerDeviceHandle;
|
||||
DeviceMulticastPointer(std::shared_ptr<PhysicalCudaMemory<char>> deviceMem, std::shared_ptr<char> mcPtr,
|
||||
size_t bufferSize)
|
||||
: deviceMem_(deviceMem), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
|
||||
DeviceMulticastPointer(void* devicePtr, std::shared_ptr<char> mcPtr, size_t bufferSize)
|
||||
: devicePtr_(devicePtr), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
|
||||
DeviceHandle deviceHandle();
|
||||
char* getDevicePtr();
|
||||
void* getDevicePtr();
|
||||
|
||||
friend class NvlsConnection;
|
||||
};
|
||||
|
||||
std::shared_ptr<DeviceMulticastPointer> allocateAndBindCuda(size_t size);
|
||||
|
||||
/// The \p handle to the allocation (its lifetime is managed by the caller)
|
||||
/// and the \p size of the allocation.
|
||||
std::shared_ptr<char> bindAllocatedCuda(CUmemGenericAllocationHandle memHandle, size_t size);
|
||||
/// @brief bind the allocated memory via @ref mscclpp::allocSharedPhysicalCuda to the multicast handle. The behavior
|
||||
/// is undefined if the devicePtr is not allocated by @ref mscclpp::allocSharedPhysicalCuda.
|
||||
/// @param devicePtr
|
||||
/// @param size
|
||||
/// @return DeviceMulticastPointer with devicePtr, mcPtr and bufferSize
|
||||
DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size);
|
||||
|
||||
size_t getMultiCastMinGranularity();
|
||||
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
#include <cuda_fp16.h>
|
||||
#endif // defined(MSCCLPP_DEVICE_CUDA)
|
||||
|
||||
#include <mscclpp/gpu_data_types.hpp>
|
||||
|
||||
#include "device.hpp"
|
||||
|
||||
namespace mscclpp {
|
||||
@@ -27,7 +29,11 @@ struct DeviceMulticastPointerDeviceHandle {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
template <typename TValue, typename T>
|
||||
MSCCLPP_DEVICE_INLINE static void multimemLoadReduce(TValue& val, T* ptr) {
|
||||
if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, float>) {
|
||||
if constexpr (std::is_same_v<TValue, int32_t> && std::is_same_v<T, int32_t>) {
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add.s32 %0, [%1];" : "=r"(val) : "l"(ptr) : "memory");
|
||||
} else if constexpr (std::is_same_v<TValue, uint32_t> && std::is_same_v<T, uint32_t>) {
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add.u32 %0, [%1];" : "=r"(val) : "l"(ptr) : "memory");
|
||||
} else if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, float>) {
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add.v4.f32 {%0,%1,%2,%3}, [%4];"
|
||||
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||
: "l"(ptr)
|
||||
@@ -51,6 +57,13 @@ struct DeviceMulticastPointerDeviceHandle {
|
||||
: "memory");
|
||||
} else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __half2>) {
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add.f16x2 {%0}, [%1];" : "=r"(val.x) : "l"(ptr) : "memory");
|
||||
} else if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, __bfloat162>) {
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];"
|
||||
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||
: "l"(ptr)
|
||||
: "memory");
|
||||
} else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __bfloat162>) {
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add.bf16x2 {%0}, [%1];" : "=r"(val.x) : "l"(ptr) : "memory");
|
||||
} else {
|
||||
static_assert(dependentFalse<T>, "Not supported type");
|
||||
}
|
||||
@@ -58,7 +71,11 @@ struct DeviceMulticastPointerDeviceHandle {
|
||||
|
||||
template <typename TValue, typename T>
|
||||
MSCCLPP_DEVICE_INLINE static void multimemStore(const TValue& val, T* ptr) {
|
||||
if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, float>) {
|
||||
if constexpr (std::is_same_v<TValue, int32_t> && std::is_same_v<T, int32_t>) {
|
||||
asm volatile("multimem.st.relaxed.sys.global.s32 [%0], %1;" ::"l"(ptr), "r"(val) : "memory");
|
||||
} else if constexpr (std::is_same_v<TValue, uint32_t> && std::is_same_v<T, uint32_t>) {
|
||||
asm volatile("multimem.st.relaxed.sys.global.u32 [%0], %1;" ::"l"(ptr), "r"(val) : "memory");
|
||||
} else if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, float>) {
|
||||
asm volatile("multimem.st.relaxed.sys.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y),
|
||||
"r"(val.z), "r"(val.w)
|
||||
: "memory");
|
||||
@@ -76,6 +93,12 @@ struct DeviceMulticastPointerDeviceHandle {
|
||||
: "memory");
|
||||
} else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __half2>) {
|
||||
asm volatile("multimem.st.relaxed.sys.global.f16x2 [%0], {%1};" ::"l"(ptr), "r"(val.x) : "memory");
|
||||
} else if constexpr (std::is_same_v<TValue, uint4> && std::is_same_v<T, __bfloat162>) {
|
||||
asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y),
|
||||
"r"(val.z), "r"(val.w)
|
||||
: "memory");
|
||||
} else if constexpr (std::is_same_v<TValue, uint1> && std::is_same_v<T, __bfloat162>) {
|
||||
asm volatile("multimem.st.relaxed.sys.global.bf16x2 [%0], {%1};" ::"l"(ptr), "r"(val.x) : "memory");
|
||||
} else {
|
||||
static_assert(dependentFalse<T>, "Not supported type");
|
||||
}
|
||||
|
||||
@@ -26,6 +26,7 @@ from ._mscclpp import (
|
||||
PacketType,
|
||||
version,
|
||||
is_nvls_supported,
|
||||
alloc_shared_physical_cuda,
|
||||
npkit,
|
||||
)
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@ extern void register_numa(nb::module_& m);
|
||||
extern void register_nvls(nb::module_& m);
|
||||
extern void register_executor(nb::module_& m);
|
||||
extern void register_npkit(nb::module_& m);
|
||||
extern void register_gpu_utils(nb::module_& m);
|
||||
|
||||
template <typename T>
|
||||
void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
|
||||
@@ -194,4 +195,5 @@ NB_MODULE(_mscclpp, m) {
|
||||
register_nvls(m);
|
||||
register_executor(m);
|
||||
register_npkit(m);
|
||||
register_gpu_utils(m);
|
||||
}
|
||||
|
||||
30
python/mscclpp/gpu_utils_py.cpp
Normal file
30
python/mscclpp/gpu_utils_py.cpp
Normal file
@@ -0,0 +1,30 @@
|
||||
#include <nanobind/nanobind.h>
|
||||
#include <nanobind/stl/shared_ptr.h>
|
||||
|
||||
// #include <memory>
|
||||
#include <mscclpp/gpu_data_types.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
|
||||
namespace nb = nanobind;
|
||||
using namespace mscclpp;
|
||||
|
||||
class PyCudaMemory {
|
||||
public:
|
||||
PyCudaMemory(size_t size) : size_(size) { ptr_ = allocSharedPhysicalCuda<char>(size); }
|
||||
|
||||
uintptr_t getPtr() const { return (uintptr_t)(ptr_.get()); }
|
||||
size_t size() const { return size_; }
|
||||
|
||||
private:
|
||||
std::shared_ptr<char> ptr_;
|
||||
size_t size_;
|
||||
};
|
||||
|
||||
void register_gpu_utils(nb::module_& m) {
|
||||
nb::class_<PyCudaMemory>(m, "PyCudaMemory")
|
||||
.def(nb::init<size_t>(), nb::arg("size"))
|
||||
.def("get_ptr", &PyCudaMemory::getPtr, "Get the raw pointer")
|
||||
.def("size", &PyCudaMemory::size, "Get the size of the allocated memory");
|
||||
m.def(
|
||||
"alloc_shared_physical_cuda", [](size_t size) { return std::make_shared<PyCudaMemory>(size); }, nb::arg("size"));
|
||||
}
|
||||
@@ -30,7 +30,7 @@ void register_nvls(nb::module_& m) {
|
||||
});
|
||||
|
||||
nb::class_<NvlsConnection>(m, "NvlsConnection")
|
||||
.def("allocate_bind_memory", &NvlsConnection::allocateAndBindCuda)
|
||||
.def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("devicePtr"), nb::arg("size"))
|
||||
.def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity);
|
||||
|
||||
m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("allRanks"),
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
import cupy as cp
|
||||
import ctypes
|
||||
from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore
|
||||
from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore, alloc_shared_physical_cuda
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
from mscclpp.utils import KernelBuilder, pack
|
||||
|
||||
@@ -443,12 +443,15 @@ class MscclppAllReduce6:
|
||||
self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls)
|
||||
min_gran = self.nvls_connection.get_multicast_min_granularity()
|
||||
aligned_buffer_size = int(((buffer_size + min_gran - 1) // min_gran) * min_gran)
|
||||
self.nvls_mem_handle = self.nvls_connection.allocate_bind_memory(
|
||||
aligned_buffer_size
|
||||
buffer_raw = alloc_shared_physical_cuda(aligned_buffer_size)
|
||||
self.nvls_mem_handle = self.nvls_connection.bind_allocated_memory(
|
||||
buffer_raw.get_ptr(), aligned_buffer_size
|
||||
) # just using recommended size for now
|
||||
self.memory_ptr = self.nvls_mem_handle.get_device_ptr()
|
||||
|
||||
self.cp_memory_ptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, None), 0)
|
||||
self.cp_memory_ptr = cp.cuda.MemoryPointer(
|
||||
cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, buffer_raw), 0
|
||||
)
|
||||
self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr)
|
||||
|
||||
# create a sm_channel for each remote neighbor
|
||||
|
||||
@@ -8,6 +8,8 @@ from mscclpp import (
|
||||
ExecutionPlan,
|
||||
PacketType,
|
||||
npkit,
|
||||
alloc_shared_physical_cuda,
|
||||
is_nvls_supported,
|
||||
)
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
from mscclpp.utils import KernelBuilder, pack
|
||||
@@ -125,6 +127,18 @@ def dtype_to_mscclpp_dtype(dtype):
|
||||
raise ValueError(f"Unknown data type: {dtype}")
|
||||
|
||||
|
||||
def allocate_buffer(nelems, dtype):
|
||||
if is_nvls_supported:
|
||||
buffer_raw = alloc_shared_physical_cuda(nelems * cp.dtype(dtype).itemsize)
|
||||
buffer_ptr = cp.cuda.MemoryPointer(
|
||||
cp.cuda.UnownedMemory(buffer_raw.get_ptr(), buffer_raw.size(), buffer_raw), 0
|
||||
)
|
||||
buffer = cp.ndarray(nelems, dtype=dtype, memptr=buffer_ptr)
|
||||
return buffer
|
||||
else:
|
||||
return cp.zeros(nelems, dtype=dtype)
|
||||
|
||||
|
||||
def build_bufs(
|
||||
execution_plan_name: str,
|
||||
size: int,
|
||||
@@ -144,14 +158,14 @@ def build_bufs(
|
||||
nelems_input = nelems
|
||||
nelems_output = nelems
|
||||
|
||||
result_buf = cp.zeros(nelems_output, dtype=dtype)
|
||||
result_buf = allocate_buffer(nelems_output, dtype=dtype)
|
||||
if in_place:
|
||||
if "allgather" in execution_plan_name:
|
||||
input_buf = cp.split(result_buf, num_ranks)[rank]
|
||||
else:
|
||||
input_buf = result_buf
|
||||
else:
|
||||
input_buf = cp.zeros(nelems_input, dtype=dtype)
|
||||
input_buf = allocate_buffer(nelems_input, dtype=dtype)
|
||||
test_buf = cp.zeros(nelems_output, dtype=dtype)
|
||||
|
||||
return input_buf, result_buf, test_buf
|
||||
|
||||
@@ -52,6 +52,8 @@ auto getOpType = [](const std::string& str) {
|
||||
return mscclpp::OperationType::TRANSFORM_TO_PACKET;
|
||||
} else if (str == "rpkt") {
|
||||
return mscclpp::OperationType::REDUCE_PACKET;
|
||||
} else if (str == "glres") {
|
||||
return mscclpp::OperationType::MULTI_LOAD_REDUCE_STORE;
|
||||
} else {
|
||||
throw mscclpp::Error("Invalid operation type", mscclpp::ErrorCode::ExecutorError);
|
||||
}
|
||||
@@ -76,11 +78,15 @@ auto convertToChannelType = [](const std::string& str) {
|
||||
return mscclpp::ChannelType::PROXY;
|
||||
} else if (str == "none") {
|
||||
return mscclpp::ChannelType::NONE;
|
||||
} else if (str == "nvls") {
|
||||
return mscclpp::ChannelType::NVLS;
|
||||
} else {
|
||||
throw mscclpp::Error("Invalid channel type", mscclpp::ErrorCode::ExecutorError);
|
||||
}
|
||||
};
|
||||
|
||||
std::set groupChannelType{mscclpp::ChannelType::NVLS};
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace mscclpp {
|
||||
@@ -100,7 +106,7 @@ std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, BufferTy
|
||||
}
|
||||
|
||||
std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfosByDstRank(int rank, BufferType bufferType) const {
|
||||
auto pred = [rank, bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; };
|
||||
auto pred = [bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; };
|
||||
return filter(this->channelInfosByDstRank.at(rank), pred);
|
||||
}
|
||||
|
||||
@@ -126,6 +132,8 @@ std::vector<ChannelInfo> ExecutionPlan::Impl::getUnpairedChannelInfos(int rank,
|
||||
return unpaired;
|
||||
}
|
||||
|
||||
std::vector<NvlsInfo> ExecutionPlan::Impl::getNvlsInfos(int rank) const { return this->nvlsInfos.at(rank); }
|
||||
|
||||
std::vector<int> ExecutionPlan::Impl::getConnectedPeers(int rank) const {
|
||||
std::set<int> peers;
|
||||
for (const auto& info : this->channelInfos.at(rank)) {
|
||||
@@ -181,6 +189,8 @@ void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize, size_t outputSize,
|
||||
if (protocol == "LL") {
|
||||
this->isUsingPacket = true;
|
||||
}
|
||||
this->inputSize = inputSize;
|
||||
this->outputSize = outputSize;
|
||||
this->nThreadsPerBlock = obj.value("num_threads_per_block", 1024);
|
||||
const auto& gpus = obj["gpus"];
|
||||
|
||||
@@ -192,9 +202,6 @@ void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize, size_t outputSize,
|
||||
this->chunkGroups[rank] = gpu["chunkGroups"];
|
||||
}
|
||||
this->setupChannels(gpus);
|
||||
|
||||
this->inputSize = inputSize;
|
||||
this->outputSize = outputSize;
|
||||
this->setupOperations(gpus, contsSrcOffset, constDstOffset);
|
||||
}
|
||||
|
||||
@@ -224,15 +231,24 @@ void ExecutionPlan::Impl::lightLoadExecutionPlan(size_t inputSize, size_t output
|
||||
this->setupOperations(gpus, contsSrcOffset, constDstOffset);
|
||||
}
|
||||
|
||||
// Construct the channel info. Step 1. Flatten SM and PROXY channels into separate vectors.
|
||||
// Step 2. For each threadblock, construct a vector of channel indexes and keys.
|
||||
void ExecutionPlan::Impl::setupChannels(const json& gpus) {
|
||||
using mapKey = std::tuple<int, BufferType, BufferType, ChannelType>;
|
||||
std::map<mapKey, std::vector<int>> chanConnectedPeersMap;
|
||||
for (const auto& gpu : gpus) {
|
||||
int rank = gpu["id"];
|
||||
std::vector<ChannelInfo> channelInfos;
|
||||
for (const auto& channel : gpu["channels"]) {
|
||||
void ExecutionPlan::Impl::parseChannels(
|
||||
const json& gpu, std::vector<ChannelInfo>& channelInfos, std::vector<NvlsInfo>& nvlsInfos,
|
||||
std::map<std::tuple<int, BufferType, BufferType, ChannelType>, std::vector<int>>& chanConnectedPeersMap, int rank) {
|
||||
for (const auto& channel : gpu["channels"]) {
|
||||
ChannelType chanType = convertToChannelType(channel["type"]);
|
||||
|
||||
if (chanType == ChannelType::NVLS) {
|
||||
NvlsInfo info;
|
||||
info.bufferType = convertToBufferType(channel["buff"]);
|
||||
for (const auto& group : channel["rankGroups"]) {
|
||||
info.bufferSize = (int)group["size"] * this->getUpperBoundChunkSize(rank, this->inputSize, this->outputSize);
|
||||
info.ranks.clear();
|
||||
for (int rank : group["ranks"]) {
|
||||
info.ranks.push_back(rank);
|
||||
}
|
||||
nvlsInfos.push_back(info);
|
||||
}
|
||||
} else {
|
||||
ChannelInfo info;
|
||||
info.srcBufferType = convertToBufferType(channel["srcbuff"]);
|
||||
info.dstBufferType = convertToBufferType(channel["dstbuff"]);
|
||||
@@ -244,7 +260,21 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) {
|
||||
}
|
||||
channelInfos.push_back(info);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Construct the channel info. Step 1. Flatten SM and PROXY channels into separate vectors.
|
||||
// Step 2. For each threadblock, construct a vector of channel indexes and keys.
|
||||
void ExecutionPlan::Impl::setupChannels(const json& gpus) {
|
||||
using mapKey = std::tuple<int, BufferType, BufferType, ChannelType>;
|
||||
std::map<mapKey, std::vector<int>> chanConnectedPeersMap;
|
||||
for (const auto& gpu : gpus) {
|
||||
int rank = gpu["id"];
|
||||
std::vector<ChannelInfo> channelInfos;
|
||||
std::vector<NvlsInfo> nvlsInfos;
|
||||
this->parseChannels(gpu, channelInfos, nvlsInfos, chanConnectedPeersMap, rank);
|
||||
this->channelInfos[rank] = channelInfos;
|
||||
this->nvlsInfos[rank] = nvlsInfos;
|
||||
}
|
||||
|
||||
for (const auto& [key, connectedFrom] : chanConnectedPeersMap) {
|
||||
@@ -260,21 +290,30 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) {
|
||||
// setup threadblockChannelMap
|
||||
for (const auto& gpu : gpus) {
|
||||
int rank = gpu["id"];
|
||||
auto channelTypes = {ChannelType::SM, ChannelType::PROXY};
|
||||
auto channelTypes = {ChannelType::SM, ChannelType::PROXY, ChannelType::NVLS};
|
||||
std::unordered_map<ChannelKey, std::vector<int>> channelMap;
|
||||
for (auto channelType : channelTypes) {
|
||||
const std::vector<ChannelInfo> channelInfos = this->getChannelInfos(rank, channelType);
|
||||
int index = 0;
|
||||
for (const auto& info : channelInfos) {
|
||||
ChannelKey key = {info.srcBufferType, info.dstBufferType, info.channelType};
|
||||
for (size_t i = 0; i < info.connectedPeers.size(); i++) {
|
||||
if (channelType == ChannelType::NVLS) {
|
||||
const std::vector<NvlsInfo> nvlsInfos = this->getNvlsInfos(rank);
|
||||
for (const auto& info : nvlsInfos) {
|
||||
ChannelKey key = {info.bufferType, info.bufferType, ChannelType::NVLS};
|
||||
channelMap[key].push_back(index++);
|
||||
}
|
||||
} else {
|
||||
for (const auto& info : channelInfos) {
|
||||
ChannelKey key = {info.srcBufferType, info.dstBufferType, info.channelType};
|
||||
for (size_t i = 0; i < info.connectedPeers.size(); i++) {
|
||||
channelMap[key].push_back(index++);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
int nthreadblocks = gpu["threadblocks"].size();
|
||||
this->threadblockSMChannelMap[rank].resize(nthreadblocks);
|
||||
this->threadblockProxyChannelMap[rank].resize(nthreadblocks);
|
||||
this->threadblockNvlsChannelMap[rank].resize(nthreadblocks);
|
||||
for (const auto& threadblock : gpu["threadblocks"]) {
|
||||
for (const auto& channel : threadblock["channels"]) {
|
||||
ChannelType channelType = convertToChannelType(channel["ctype"]);
|
||||
@@ -284,6 +323,8 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) {
|
||||
this->threadblockSMChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key);
|
||||
} else if (channelType == ChannelType::PROXY) {
|
||||
this->threadblockProxyChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key);
|
||||
} else if (channelType == ChannelType::NVLS) {
|
||||
this->threadblockNvlsChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -314,6 +355,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
|
||||
int threadblockId = threadblock["id"];
|
||||
const auto& smChannels = this->threadblockSMChannelMap[rank][threadblockId];
|
||||
const auto& proxyChannels = this->threadblockProxyChannelMap[rank][threadblockId];
|
||||
const auto& nvlsChannels = this->threadblockNvlsChannelMap[rank][threadblockId];
|
||||
for (size_t i = 0; i < smChannels.size(); i++) {
|
||||
const auto& [_, key] = smChannels[i];
|
||||
channelIndexes[key].push_back(i);
|
||||
@@ -322,6 +364,10 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
|
||||
const auto& [_, key] = proxyChannels[i];
|
||||
channelIndexes[key].push_back(i);
|
||||
}
|
||||
for (size_t i = 0; i < nvlsChannels.size(); i++) {
|
||||
const auto& [_, key] = nvlsChannels[i];
|
||||
channelIndexes[key].push_back(i);
|
||||
}
|
||||
for (const auto& op : threadblock["ops"]) {
|
||||
Operation operation = {};
|
||||
std::vector<uint32_t> chunkIndexes;
|
||||
@@ -330,17 +376,24 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
|
||||
operation.channelType = convertToChannelType(op["ctype"]);
|
||||
}
|
||||
if (op.contains("i_cids")) {
|
||||
operation.nInputs = op["i_cids"].size();
|
||||
for (int i = 0; i < operation.nInputs; i++) {
|
||||
BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
|
||||
BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
|
||||
// Get the relevant channel index in rank channelInfos
|
||||
operation.inputChannelIndexes[i] =
|
||||
channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
|
||||
operation.inputOffsets[i] =
|
||||
this->getOffset(rank, this->inputSize, this->outputSize, (uint32_t)op["i_cids"][i]["off"]) +
|
||||
getConstOffset(srcBufferType);
|
||||
chunkIndexes.push_back((uint32_t)op["i_cids"][i]["off"]);
|
||||
if (operation.channelType == mscclpp::ChannelType::NVLS) {
|
||||
BufferType srcBufferType = convertToBufferType(op["srcbuff"]);
|
||||
operation.nvlsInputIndex =
|
||||
channelIndexes[{srcBufferType, srcBufferType, ChannelType::NVLS}][op["i_cids"][0]["id"]];
|
||||
chunkIndexes.push_back((uint32_t)op["srcoff"]);
|
||||
} else {
|
||||
operation.nInputs = op["i_cids"].size();
|
||||
for (int i = 0; i < operation.nInputs; i++) {
|
||||
BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
|
||||
BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
|
||||
// Get the relevant channel index in rank channelInfos
|
||||
operation.inputChannelIndexes[i] =
|
||||
channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
|
||||
operation.inputOffsets[i] =
|
||||
this->getOffset(rank, this->inputSize, this->outputSize, (uint32_t)op["i_cids"][i]["off"]) +
|
||||
getConstOffset(srcBufferType);
|
||||
chunkIndexes.push_back((uint32_t)op["i_cids"][i]["off"]);
|
||||
}
|
||||
}
|
||||
}
|
||||
// will have either srcs or i_cids
|
||||
@@ -357,14 +410,21 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
|
||||
if (op.contains("o_cids")) {
|
||||
operation.nOutputs = op["o_cids"].size();
|
||||
for (int i = 0; i < operation.nOutputs; i++) {
|
||||
BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
|
||||
BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
|
||||
operation.outputChannelIndexes[i] =
|
||||
channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]];
|
||||
operation.outputOffsets[i] =
|
||||
this->getOffset(rank, this->inputSize, this->outputSize, (uint32_t)op["o_cids"][i]["off"]) +
|
||||
getConstOffset(dstBufferType);
|
||||
chunkIndexes.push_back((uint32_t)op["o_cids"][i]["off"]);
|
||||
if (operation.channelType == mscclpp::ChannelType::NVLS) {
|
||||
BufferType dstBufferType = convertToBufferType(op["dstbuff"]);
|
||||
operation.nvlsInputIndex =
|
||||
channelIndexes[{dstBufferType, dstBufferType, ChannelType::NVLS}][op["o_cids"][0]["id"]];
|
||||
chunkIndexes.push_back((uint32_t)op["dstoff"]);
|
||||
} else {
|
||||
BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
|
||||
BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
|
||||
operation.outputChannelIndexes[i] =
|
||||
channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]];
|
||||
operation.outputOffsets[i] =
|
||||
this->getOffset(rank, this->inputSize, this->outputSize, (uint32_t)op["o_cids"][i]["off"]) +
|
||||
getConstOffset(dstBufferType);
|
||||
chunkIndexes.push_back((uint32_t)op["o_cids"][i]["off"]);
|
||||
}
|
||||
}
|
||||
}
|
||||
// will have either dsts or o_cids
|
||||
@@ -460,11 +520,19 @@ size_t ExecutionPlan::Impl::getNChunkSize(int rank, size_t inputSize, size_t out
|
||||
return nChunkSize;
|
||||
}
|
||||
|
||||
size_t ExecutionPlan::Impl::getUpperBoundChunkSize(int rank, size_t inputSize, size_t outputSize) const {
|
||||
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
|
||||
uint32_t nChunks = sizePerRank.second;
|
||||
return (sizePerRank.first + nChunks - 1) / nChunks;
|
||||
}
|
||||
|
||||
void ExecutionPlan::Impl::reset() {
|
||||
this->operations.clear();
|
||||
this->channelInfos.clear();
|
||||
this->nvlsInfos.clear();
|
||||
this->threadblockSMChannelMap.clear();
|
||||
this->threadblockProxyChannelMap.clear();
|
||||
this->threadblockNvlsChannelMap.clear();
|
||||
this->inputChunks.clear();
|
||||
this->outputChunks.clear();
|
||||
this->scratchChunks.clear();
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <mscclpp/executor.hpp>
|
||||
#include <mscclpp/nvls.hpp>
|
||||
#include <mscclpp/proxy_channel.hpp>
|
||||
#include <mscclpp/sm_channel.hpp>
|
||||
#include <set>
|
||||
@@ -23,6 +24,19 @@ struct ExecutionContextKey {
|
||||
}
|
||||
};
|
||||
|
||||
void* getBuffer(BufferType type, void* sendbuff, void* recvbuff, void* scratch) {
|
||||
switch (type) {
|
||||
case BufferType::INPUT:
|
||||
return sendbuff;
|
||||
case BufferType::OUTPUT:
|
||||
return recvbuff;
|
||||
case BufferType::SCRATCH:
|
||||
return scratch;
|
||||
default:
|
||||
throw Error("Invalid buffer type", ErrorCode::ExecutorError);
|
||||
}
|
||||
};
|
||||
|
||||
struct DeviceExecutionPlanKey {
|
||||
size_t inputMessageSize;
|
||||
size_t outputMessageSize;
|
||||
@@ -97,11 +111,13 @@ namespace mscclpp {
|
||||
struct ExecutionContext {
|
||||
std::shared_ptr<ProxyService> proxyService;
|
||||
std::unordered_map<int, std::shared_ptr<Connection>> connections;
|
||||
std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections;
|
||||
std::unordered_map<std::pair<BufferType, int>, mscclpp::RegisteredMemory> registeredMemories;
|
||||
std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
|
||||
std::vector<mscclpp::SemaphoreId> proxySemaphores;
|
||||
std::vector<mscclpp::SmChannel> smChannels;
|
||||
std::vector<mscclpp::SimpleProxyChannel> proxyChannels;
|
||||
std::vector<mscclpp::NvlsConnection::DeviceMulticastPointer> nvlsChannels;
|
||||
std::unordered_map<DeviceExecutionPlanKey, std::vector<DeviceExecutionPlan>> deviceExecutionPlans;
|
||||
std::unordered_map<DeviceExecutionPlanKey, std::shared_ptr<char>> deviceExecutionPlansBuffers;
|
||||
std::shared_ptr<char> scratchBuffer;
|
||||
@@ -152,7 +168,12 @@ struct Executor::Impl {
|
||||
|
||||
ExecutionContext context;
|
||||
size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize, recvBufferSize);
|
||||
std::shared_ptr<char> scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
|
||||
std::shared_ptr<char> scratchBuffer;
|
||||
if (isNvlsSupported()) {
|
||||
scratchBuffer = allocSharedPhysicalCuda<char>(scratchBufferSize);
|
||||
} else {
|
||||
scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
|
||||
}
|
||||
context.scratchBuffer = scratchBuffer;
|
||||
context.scratchBufferSize = scratchBufferSize;
|
||||
context.proxyService = std::make_shared<ProxyService>();
|
||||
@@ -160,6 +181,7 @@ struct Executor::Impl {
|
||||
this->setupConnections(context, rank, plan);
|
||||
this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
|
||||
this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
|
||||
this->setupNvlsChannels(context, sendbuff, recvbuff, rank, plan);
|
||||
this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan);
|
||||
context.deviceExecutionPlansBuffers[devicePlanKey] =
|
||||
allocExtSharedCuda<char>(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan));
|
||||
@@ -202,6 +224,13 @@ struct Executor::Impl {
|
||||
for (size_t i = 0; i < connectionFutures.size(); i++) {
|
||||
context.connections[connectedPeers[i]] = connectionFutures[i].get();
|
||||
}
|
||||
|
||||
std::vector<NvlsInfo> nvlsInfos = plan.impl_->getNvlsInfos(rank);
|
||||
for (const NvlsInfo& info : nvlsInfos) {
|
||||
std::shared_ptr<NvlsConnection> nvlsConnection =
|
||||
mscclpp::connectNvlsCollective(this->comm, info.ranks, info.bufferSize);
|
||||
context.nvlsConnections.push_back(nvlsConnection);
|
||||
}
|
||||
}
|
||||
|
||||
void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
|
||||
@@ -284,18 +313,6 @@ struct Executor::Impl {
|
||||
context.smSemaphores = std::move(smSemaphores);
|
||||
context.proxySemaphores = std::move(proxySemaphores);
|
||||
|
||||
auto getBuffer = [&](BufferType type) {
|
||||
switch (type) {
|
||||
case BufferType::INPUT:
|
||||
return sendbuff;
|
||||
case BufferType::OUTPUT:
|
||||
return recvbuff;
|
||||
case BufferType::SCRATCH:
|
||||
return (void*)context.scratchBuffer.get();
|
||||
default:
|
||||
throw Error("Invalid buffer type", ErrorCode::ExecutorError);
|
||||
}
|
||||
};
|
||||
auto getBufferSize = [&](BufferType type) {
|
||||
switch (type) {
|
||||
case BufferType::INPUT:
|
||||
@@ -313,7 +330,7 @@ struct Executor::Impl {
|
||||
std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, channelType);
|
||||
int index = 0;
|
||||
for (ChannelInfo& info : channelInfos) {
|
||||
void* src = getBuffer(info.srcBufferType);
|
||||
void* src = getBuffer(info.srcBufferType, sendbuff, recvbuff, context.scratchBuffer.get());
|
||||
size_t bufferSize = getBufferSize(info.srcBufferType);
|
||||
TransportFlags transport = getTransportFlags(channelInfos, rank);
|
||||
RegisteredMemory localMemory = this->comm->registerMemory(src, bufferSize, transport);
|
||||
@@ -332,6 +349,19 @@ struct Executor::Impl {
|
||||
}
|
||||
}
|
||||
|
||||
void setupNvlsChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, int rank,
|
||||
const ExecutionPlan& plan) {
|
||||
std::vector<NvlsInfo> nvlsInfos = plan.impl_->getNvlsInfos(rank);
|
||||
for (size_t i = 0; i < nvlsInfos.size(); i++) {
|
||||
std::shared_ptr<NvlsConnection> nvlsConnection = context.nvlsConnections[i];
|
||||
NvlsInfo info = nvlsInfos[i];
|
||||
void* buffer = getBuffer(info.bufferType, sendbuff, recvbuff, context.scratchBuffer.get());
|
||||
NvlsConnection::DeviceMulticastPointer deviceMulticastPointer =
|
||||
nvlsConnection->bindAllocatedMemory((CUdeviceptr)buffer, info.bufferSize);
|
||||
context.nvlsChannels.push_back(deviceMulticastPointer);
|
||||
}
|
||||
}
|
||||
|
||||
void setupDeviceExecutionPlan(ExecutionContext& context, const DeviceExecutionPlanKey& key, int rank,
|
||||
const ExecutionPlan& plan) {
|
||||
std::vector<DeviceExecutionPlan> deviceExecutionPlans;
|
||||
@@ -349,6 +379,10 @@ struct Executor::Impl {
|
||||
for (const auto& [index, _] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) {
|
||||
deviceExecutionPlan.channels.proxyChannels[chanIndex++] = mscclpp::deviceHandle(context.proxyChannels[index]);
|
||||
}
|
||||
chanIndex = 0;
|
||||
for (const auto& [index, _] : plan.impl_->threadblockNvlsChannelMap.at(rank).at(threadblock)) {
|
||||
deviceExecutionPlan.channels.nvlsChannels[chanIndex++] = mscclpp::deviceHandle(context.nvlsChannels[index]);
|
||||
}
|
||||
for (size_t i = 0; i < ops.size(); i++) {
|
||||
deviceExecutionPlan.operations[i] = ops[i];
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#ifndef MSCCLPP_EXECUTION_COMMON_HPP_
|
||||
#define MSCCLPP_EXECUTION_COMMON_HPP_
|
||||
|
||||
#include <mscclpp/nvls.hpp>
|
||||
#include <mscclpp/proxy_channel.hpp>
|
||||
#include <mscclpp/sm_channel.hpp>
|
||||
|
||||
@@ -24,6 +25,7 @@ enum class ChannelType : uint8_t {
|
||||
NONE,
|
||||
SM,
|
||||
PROXY,
|
||||
NVLS,
|
||||
};
|
||||
|
||||
// NOTE(chhwang): any modification here requires corresponding updates in `tools/npkit/npkit_trace_generator.py`.
|
||||
@@ -46,11 +48,13 @@ enum class OperationType : uint8_t {
|
||||
REDUCE_SEND_PACKET,
|
||||
READ_REDUCE_COPY,
|
||||
READ_REDUCE_COPY_SEND,
|
||||
MULTI_LOAD_REDUCE_STORE,
|
||||
};
|
||||
|
||||
struct Channels {
|
||||
mscclpp::DeviceHandle<mscclpp::SmChannel> smChannels[MAX_CHANNEL];
|
||||
mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> proxyChannels[MAX_CHANNEL];
|
||||
mscclpp::DeviceHandle<mscclpp::NvlsConnection::DeviceMulticastPointer> nvlsChannels[MAX_CHANNEL];
|
||||
};
|
||||
|
||||
struct Operation {
|
||||
@@ -61,12 +65,18 @@ struct Operation {
|
||||
uint8_t nInputs;
|
||||
uint8_t nOutputs;
|
||||
union {
|
||||
// For ops which require reading from multiple remote sources
|
||||
uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
|
||||
// For ops which require reading from multiple local sources
|
||||
BufferType inputBufferType;
|
||||
uint8_t nvlsInputIndex;
|
||||
};
|
||||
union {
|
||||
// For ops which require writing to multiple remote destinations
|
||||
uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
|
||||
// For ops which require writing to multiple local destinations
|
||||
BufferType outputBufferType;
|
||||
uint8_t nvlsOutputIndex;
|
||||
};
|
||||
uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION];
|
||||
uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION];
|
||||
@@ -75,12 +85,12 @@ struct Operation {
|
||||
uint32_t size;
|
||||
};
|
||||
|
||||
// total size = 1920 + 6400 + 4 + 4(padding) + 12(align) = 8336 bytes
|
||||
// total size = 2304 + 6400 + 4 + 12(padding) = 8720 bytes
|
||||
struct __attribute__((aligned(16))) DeviceExecutionPlan {
|
||||
uint8_t nSmChannels; // 1 bytes
|
||||
uint8_t nProxyChannels; // 1 bytes
|
||||
uint16_t nOperations; // 2 bytes
|
||||
Channels channels; // 1920 bytes
|
||||
Channels channels; // 2304 bytes
|
||||
Operation operations[MAX_OPERATION]; // 64 * 100 = 6400 bytes
|
||||
};
|
||||
|
||||
|
||||
@@ -15,7 +15,8 @@
|
||||
#include "execution_common.hpp"
|
||||
|
||||
#if defined(MSCCLPP_DEVICE_COMPILE)
|
||||
#include "gpu_data_types.hpp"
|
||||
#include <mscclpp/gpu_data_types.hpp>
|
||||
#include <mscclpp/nvls_device.hpp>
|
||||
|
||||
namespace {
|
||||
template <typename To, typename From>
|
||||
@@ -138,6 +139,34 @@ MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__bfloat16>(uint32_t a, uint32_t b) {
|
||||
return add_vectors_helper<__bfloat162>(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct VectorType {
|
||||
using type = T;
|
||||
using nvls_type = T;
|
||||
using nvls_type2 = T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct VectorType<__half> {
|
||||
using type = __half2;
|
||||
using nvls_type = uint4;
|
||||
using nvls_type2 = uint1;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct VectorType<__bfloat16> {
|
||||
using type = __bfloat162;
|
||||
using nvls_type = uint4;
|
||||
using nvls_type2 = uint1;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct VectorType<float> {
|
||||
using type = float;
|
||||
using nvls_type = uint4;
|
||||
using nvls_type2 = uint1;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
#endif // defined(MSCCLPP_DEVICE_COMPILE)
|
||||
|
||||
@@ -401,6 +430,37 @@ MSCCLPP_DEVICE_INLINE void handleCopy(void* dst, void* src, uint32_t dstOffset,
|
||||
Element::copy(dstData, srcData, size, threadIdx.x, blockDim.x);
|
||||
}
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
|
||||
template <typename T>
|
||||
MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* dst, T* src, uint32_t dstOffset, uint32_t srcOffset,
|
||||
size_t size) {
|
||||
using vectorType = typename VectorType<T>::type;
|
||||
using nvlsType = typename VectorType<T>::nvls_type;
|
||||
// nvls can only handle 4 bytes alignment
|
||||
assert(size % sizeof(vectorType) == 0);
|
||||
const size_t nInt4 = size / sizeof(nvlsType);
|
||||
const size_t srcOffset4 = srcOffset / sizeof(nvlsType);
|
||||
const size_t dstOffset4 = dstOffset / sizeof(nvlsType);
|
||||
nvlsType* src4 = (nvlsType*)src;
|
||||
nvlsType* dst4 = (nvlsType*)dst;
|
||||
for (size_t idx = threadIdx.x; idx < nInt4; idx += blockDim.x) {
|
||||
nvlsType val;
|
||||
DeviceMulticastPointerDeviceHandle::multimemLoadReduce(val, (vectorType*)(src4 + srcOffset4 + idx));
|
||||
DeviceMulticastPointerDeviceHandle::multimemStore(val, (vectorType*)(dst4 + dstOffset4 + idx));
|
||||
}
|
||||
// handle rest of data
|
||||
size_t processed = nInt4 * sizeof(nvlsType);
|
||||
using nvlsType2 = typename VectorType<T>::nvls_type2;
|
||||
const size_t startIdx = (srcOffset + processed) / sizeof(nvlsType2);
|
||||
const size_t endIdx = (dstOffset + size) / sizeof(nvlsType2);
|
||||
for (size_t idx = threadIdx.x + startIdx; idx < endIdx; idx += blockDim.x) {
|
||||
nvlsType2 val;
|
||||
DeviceMulticastPointerDeviceHandle::multimemLoadReduce(val, (vectorType*)src + idx);
|
||||
DeviceMulticastPointerDeviceHandle::multimemStore(val, (vectorType*)dst + idx);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename T, typename PacketType = LL16Packet>
|
||||
__global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch,
|
||||
size_t scratchSize, DeviceExecutionPlan* plan, uint32_t flag
|
||||
@@ -433,6 +493,8 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
|
||||
Operation* operations = localPlan->operations;
|
||||
DeviceHandle<SmChannel>* smChannels = localPlan->channels.smChannels;
|
||||
DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
|
||||
[[maybe_unused]] DeviceHandle<NvlsConnection::DeviceMulticastPointer>* nvlsChannels =
|
||||
localPlan->channels.nvlsChannels;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
#if defined(MSCCLPP_DEVICE_HIP)
|
||||
@@ -530,6 +592,13 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
|
||||
handleReduceSend(dst, op.dstOffset, src, op.srcOffset, tmp, op.inputOffsets, smChannels, op.outputChannelIndexes,
|
||||
op.outputOffsets, op.nOutputs, op.size);
|
||||
}
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
|
||||
else if (op.type == OperationType::MULTI_LOAD_REDUCE_STORE) {
|
||||
T* dst = (T*)(nvlsChannels[op.nvlsOutputIndex].mcPtr);
|
||||
T* src = (T*)(nvlsChannels[op.nvlsInputIndex].mcPtr);
|
||||
handleMultiLoadReduceStore(dst, src, op.dstOffset, op.srcOffset, op.size);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT)
|
||||
NpKit::CollectGpuEventShm(NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT + (int)op.type, op.size, 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
|
||||
@@ -23,6 +23,12 @@ struct ChannelKey {
|
||||
channelType == other.channelType;
|
||||
}
|
||||
};
|
||||
|
||||
struct NvlsInfo {
|
||||
std::vector<int> ranks;
|
||||
size_t bufferSize;
|
||||
BufferType bufferType;
|
||||
};
|
||||
} // namespace mscclpp
|
||||
|
||||
namespace std {
|
||||
@@ -63,6 +69,7 @@ struct ExecutionPlan::Impl {
|
||||
std::vector<ChannelInfo> getChannelInfos(int rank, BufferType bufferType) const;
|
||||
std::vector<ChannelInfo> getChannelInfosByDstRank(int rank, BufferType bufferType) const;
|
||||
std::vector<ChannelInfo> getUnpairedChannelInfos(int rank, int worldSize, ChannelType channelType);
|
||||
std::vector<NvlsInfo> getNvlsInfos(int rank) const;
|
||||
std::vector<int> getConnectedPeers(int rank) const;
|
||||
std::vector<BufferType> getConnectedBufferTypes(int rank) const;
|
||||
size_t getScratchBufferSize(int rank, size_t inputSize, size_t outputSize) const;
|
||||
@@ -86,9 +93,12 @@ struct ExecutionPlan::Impl {
|
||||
std::unordered_map<int, std::vector<ChannelInfo>> channelInfos;
|
||||
std::unordered_map<int, std::vector<ChannelInfo>> channelInfosByDstRank;
|
||||
std::unordered_map<std::pair<int, ChannelType>, std::unordered_map<int, int>> channelCountMap;
|
||||
// for nvls channels
|
||||
std::unordered_map<int, std::vector<NvlsInfo>> nvlsInfos;
|
||||
// threadblockChannelMap[rank][threadblock] = [channelIndex, channelKey]
|
||||
std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockSMChannelMap;
|
||||
std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockProxyChannelMap;
|
||||
std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockNvlsChannelMap;
|
||||
std::unordered_map<int, uint32_t> inputChunks;
|
||||
std::unordered_map<int, uint32_t> outputChunks;
|
||||
std::unordered_map<int, uint32_t> scratchChunks;
|
||||
@@ -102,6 +112,13 @@ struct ExecutionPlan::Impl {
|
||||
size_t getOffset(int rank, size_t inputSize, size_t outputSize, uint32_t chunkIndex, uint32_t alignment = 16) const;
|
||||
size_t getNChunkSize(int rank, size_t inputSize, size_t outputSize, uint32_t nChunks,
|
||||
const std::vector<uint32_t> offsets) const;
|
||||
size_t getUpperBoundChunkSize(int rank, size_t inputSize, size_t outputSize) const;
|
||||
|
||||
// helper functions to setup the channels
|
||||
void parseChannels(
|
||||
const nlohmann::json& gpu, std::vector<ChannelInfo>& channelInfos, std::vector<NvlsInfo>& nvlsInfos,
|
||||
std::map<std::tuple<int, BufferType, BufferType, ChannelType>, std::vector<int>>& chanConnectedPeersMap,
|
||||
int rank);
|
||||
};
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -27,6 +27,10 @@ struct TransportInfo {
|
||||
const IbMr* ibMr;
|
||||
IbMrInfo ibMrInfo;
|
||||
};
|
||||
struct {
|
||||
char shareableHandle[64];
|
||||
size_t offsetFromBase;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
@@ -39,6 +43,7 @@ struct RegisteredMemory::Impl {
|
||||
size_t size;
|
||||
uint64_t hostHash;
|
||||
uint64_t pidHash;
|
||||
bool isCuMemMapAlloc;
|
||||
TransportFlags transports;
|
||||
std::vector<TransportInfo> transportInfos;
|
||||
|
||||
|
||||
54
src/nvls.cc
54
src/nvls.cc
@@ -15,7 +15,7 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
#if (USE_NVLS)
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
class NvlsConnection::Impl : public std::enable_shared_from_this<NvlsConnection::Impl> {
|
||||
public:
|
||||
// use this only for the root of the NVLS
|
||||
@@ -31,10 +31,11 @@ class NvlsConnection::Impl : public std::enable_shared_from_this<NvlsConnection:
|
||||
void addDevice(int cudaDeviceId);
|
||||
size_t allocateBuffer(size_t size);
|
||||
void freeBuffer(size_t offset, size_t size) noexcept;
|
||||
std::shared_ptr<char> bindMemory(CUmemGenericAllocationHandle memHandle, size_t devBuffSize);
|
||||
std::shared_ptr<char> bindMemory(CUdeviceptr devicePtr, size_t devBuffSize);
|
||||
|
||||
private:
|
||||
friend class NvlsConnection;
|
||||
|
||||
CUmemGenericAllocationHandle mcHandle_;
|
||||
CUmulticastObjectProp mcProp_;
|
||||
size_t bufferSize_;
|
||||
@@ -70,8 +71,10 @@ NvlsConnection::Impl::Impl(size_t bufferSize, int numDevices) {
|
||||
throw mscclpp::SysError("getpid() failed", errno);
|
||||
}
|
||||
|
||||
INFO(MSCCLPP_COLL, "NVLS handle created on root with size %ld. minGranularity %ld and recommendedGranularity %ld\n",
|
||||
mcProp_.size, minMcGran_, mcGran_);
|
||||
INFO(MSCCLPP_COLL,
|
||||
"NVLS handle created on root with size %ld. minGranularity %ld and recommendedGranularity %ld buffer size is "
|
||||
"%ld, adjusted size is %ld",
|
||||
mcProp_.size, minMcGran_, mcGran_, bufferSize, bufferSize_);
|
||||
}
|
||||
|
||||
NvlsConnection::Impl::Impl(const std::vector<char>& data) {
|
||||
@@ -128,6 +131,8 @@ void NvlsConnection::Impl::addDevice(int cudaDeviceId) {
|
||||
INFO(MSCCLPP_COLL, "NVLS connection created");
|
||||
}
|
||||
|
||||
// TODO(binyli): For cuMemMap, we can not map handle to va with offset not equal to 0.
|
||||
// Then we don't need to maintain the freeRanges_ list. For different memory, we could map to different mc handle.
|
||||
size_t NvlsConnection::Impl::allocateBuffer(size_t size) {
|
||||
if (freeRanges_.empty()) {
|
||||
throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InvalidUsage);
|
||||
@@ -187,24 +192,21 @@ void NvlsConnection::Impl::freeBuffer(size_t offset, size_t size) noexcept {
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<char> NvlsConnection::Impl::bindMemory(CUmemGenericAllocationHandle memHandle, size_t devBuffSize) {
|
||||
std::shared_ptr<char> NvlsConnection::Impl::bindMemory(CUdeviceptr devicePtr, size_t devBuffSize) {
|
||||
devBuffSize = ((devBuffSize + minMcGran_ - 1) / minMcGran_) * minMcGran_;
|
||||
size_t offset = allocateBuffer(devBuffSize);
|
||||
MSCCLPP_CUTHROW(cuMulticastBindMem(mcHandle_, offset /*mcOffset*/, memHandle, 0 /*memOffset*/, devBuffSize, 0));
|
||||
MSCCLPP_CUTHROW(cuMulticastBindAddr(mcHandle_, offset /*mcOffset*/, devicePtr, devBuffSize, 0));
|
||||
|
||||
char* mcPtr;
|
||||
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
int deviceId = -1;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
accessDesc.location.id = deviceId;
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0));
|
||||
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0));
|
||||
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1));
|
||||
setReadWriteMemoryAccess(mcPtr, devBuffSize);
|
||||
INFO(MSCCLPP_COLL, "NVLS connection bound memory at offset %ld, size %ld", offset, devBuffSize);
|
||||
|
||||
auto deleter = [=, self = shared_from_this()](char* ptr) {
|
||||
int deviceId;
|
||||
CUdevice device;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
MSCCLPP_CUTHROW(cuDeviceGet(&device, deviceId));
|
||||
MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize));
|
||||
MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize));
|
||||
@@ -214,7 +216,8 @@ std::shared_ptr<char> NvlsConnection::Impl::bindMemory(CUmemGenericAllocationHan
|
||||
|
||||
return std::shared_ptr<char>(mcPtr, deleter);
|
||||
}
|
||||
#else // !(USE_NVLS)
|
||||
|
||||
#else // !(CUDA_NVLS_SUPPORTED)
|
||||
class NvlsConnection::Impl {
|
||||
public:
|
||||
// use this only for the root of the NVLS
|
||||
@@ -227,15 +230,15 @@ class NvlsConnection::Impl {
|
||||
std::vector<char> serialize() { throw notSupportedError; }
|
||||
size_t allocateBuffer(size_t) { throw notSupportedError; }
|
||||
void freeBuffer(size_t, size_t) { throw notSupportedError; }
|
||||
std::shared_ptr<char> bindMemory(CUmemGenericAllocationHandle, size_t) { throw notSupportedError; }
|
||||
std::shared_ptr<char> bindMemory(CUdeviceptr, size_t) { throw notSupportedError; }
|
||||
void addDevice(int) { throw notSupportedError; }
|
||||
size_t getMinMcGran() { throw notSupportedError; }
|
||||
|
||||
private:
|
||||
Error notSupportedError =
|
||||
Error("NVLS is not supported on this CUDA version (< 12.1) or kernel version (< 5.6.0)", ErrorCode::InvalidUsage);
|
||||
Error("NVLS is not supported on this CUDA version (< 12.3) or kernel version (< 5.6.0)", ErrorCode::InvalidUsage);
|
||||
};
|
||||
#endif // !(USE_NVLS)
|
||||
#endif // !(CUDA_NVLS_SUPPORTED)
|
||||
|
||||
const int NvlsConnection::DefaultNvlsBufferSize = (1 << 29);
|
||||
|
||||
@@ -254,25 +257,20 @@ NvlsConnection::NvlsConnection(const std::vector<char>& data) : pimpl_(std::make
|
||||
|
||||
std::vector<char> NvlsConnection::serialize() { return pimpl_->serialize(); }
|
||||
|
||||
std::shared_ptr<NvlsConnection::DeviceMulticastPointer> NvlsConnection::allocateAndBindCuda(size_t size) {
|
||||
auto mem = allocSharedPhysicalCuda<char>(size, pimpl_->getMinMcGran());
|
||||
auto mcPtr = pimpl_->bindMemory(mem->memHandle_, size);
|
||||
return std::make_shared<DeviceMulticastPointer>(mem, mcPtr, size);
|
||||
}
|
||||
|
||||
std::shared_ptr<char> NvlsConnection::bindAllocatedCuda(CUmemGenericAllocationHandle memHandle, size_t size) {
|
||||
return pimpl_->bindMemory(memHandle, size);
|
||||
NvlsConnection::DeviceMulticastPointer NvlsConnection::bindAllocatedMemory(CUdeviceptr devicePtr, size_t size) {
|
||||
auto mcPtr = pimpl_->bindMemory(devicePtr, size);
|
||||
return DeviceMulticastPointer((void*)devicePtr, mcPtr, size);
|
||||
}
|
||||
|
||||
NvlsConnection::DeviceMulticastPointer::DeviceHandle NvlsConnection::DeviceMulticastPointer::deviceHandle() {
|
||||
NvlsConnection::DeviceMulticastPointer::DeviceHandle device;
|
||||
device.devicePtr = this->deviceMem_->devicePtr_;
|
||||
device.devicePtr = this->devicePtr_;
|
||||
device.mcPtr = this->mcPtr_.get();
|
||||
device.bufferSize = this->bufferSize_;
|
||||
return device;
|
||||
};
|
||||
|
||||
char* NvlsConnection::DeviceMulticastPointer::getDevicePtr() { return deviceMem_->devicePtr_; };
|
||||
void* NvlsConnection::DeviceMulticastPointer::getDevicePtr() { return devicePtr_; };
|
||||
|
||||
size_t NvlsConnection::getMultiCastMinGranularity() { return pimpl_->getMinMcGran(); }
|
||||
|
||||
|
||||
@@ -11,6 +11,65 @@
|
||||
#include "debug.h"
|
||||
#include "utils_internal.hpp"
|
||||
|
||||
#define MSCCLPP_CULOG_WARN(cmd) \
|
||||
do { \
|
||||
CUresult err = cmd; \
|
||||
if (err != CUDA_SUCCESS) { \
|
||||
const char* errStr; \
|
||||
if (cuGetErrorString(err, &errStr) != CUDA_SUCCESS) { \
|
||||
errStr = "failed to get error string"; \
|
||||
} \
|
||||
WARN("Call to " #cmd " failed, error is %s", errStr); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
namespace {
|
||||
// Get the recommended granularity for cuMemAddressReserve
|
||||
size_t getRecommendedGranularity() {
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
size_t gran = 0;
|
||||
int deviceId = -1;
|
||||
int currentDevice = -1;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
MSCCLPP_CUTHROW(cuDeviceGet(¤tDevice, deviceId));
|
||||
|
||||
CUmemAllocationProp prop = {};
|
||||
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.requestedHandleTypes =
|
||||
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
|
||||
prop.location.id = currentDevice;
|
||||
MSCCLPP_CUTHROW(cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||
return gran;
|
||||
#else
|
||||
throw mscclpp::Error("Only support GPU with NVLS support", mscclpp::ErrorCode::InvalidUsage);
|
||||
#endif
|
||||
}
|
||||
|
||||
CUmemAllocationHandleType getNvlsCompatibleMemHandleType() {
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
return CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
#else
|
||||
throw mscclpp::Error("Only support GPU with NVLS support", mscclpp::ErrorCode::InvalidUsage);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Check if ptr is allocaed by cuMemMap
|
||||
bool isCuMemMapAllocated(void* ptr) {
|
||||
CUmemGenericAllocationHandle handle;
|
||||
CUresult result = cuMemRetainAllocationHandle(&handle, ptr);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
MSCCLPP_CUTHROW(cuMemRelease(handle));
|
||||
if (!mscclpp::isNvlsSupported()) {
|
||||
throw mscclpp::Error("cuMemMap is used in env without NVLS support", mscclpp::ErrorCode::InvalidUsage);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl)
|
||||
@@ -23,15 +82,24 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports,
|
||||
if (transports.has(Transport::CudaIpc)) {
|
||||
TransportInfo transportInfo;
|
||||
transportInfo.transport = Transport::CudaIpc;
|
||||
cudaIpcMemHandle_t handle;
|
||||
|
||||
void* baseDataPtr;
|
||||
size_t baseDataSize; // dummy
|
||||
MSCCLPP_CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data));
|
||||
MSCCLPP_CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr));
|
||||
// TODO: bug with offset of base?
|
||||
transportInfo.cudaIpcBaseHandle = handle;
|
||||
transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr;
|
||||
this->isCuMemMapAlloc = isCuMemMapAllocated(baseDataPtr);
|
||||
if (this->isCuMemMapAlloc) {
|
||||
CUmemGenericAllocationHandle handle;
|
||||
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, baseDataPtr));
|
||||
MSCCLPP_CUTHROW(
|
||||
cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsCompatibleMemHandleType(), 0));
|
||||
transportInfo.offsetFromBase = (char*)data - (char*)baseDataPtr;
|
||||
} else {
|
||||
cudaIpcMemHandle_t handle;
|
||||
MSCCLPP_CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr));
|
||||
// TODO: bug with offset of base?
|
||||
transportInfo.cudaIpcBaseHandle = handle;
|
||||
transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr;
|
||||
}
|
||||
this->transportInfos.push_back(transportInfo);
|
||||
}
|
||||
if ((transports & AllIBTransports).any()) {
|
||||
@@ -75,6 +143,8 @@ MSCCLPP_API_CPP std::vector<char> RegisteredMemory::serialize() {
|
||||
std::copy_n(reinterpret_cast<char*>(&pimpl_->size), sizeof(pimpl_->size), std::back_inserter(result));
|
||||
std::copy_n(reinterpret_cast<char*>(&pimpl_->hostHash), sizeof(pimpl_->hostHash), std::back_inserter(result));
|
||||
std::copy_n(reinterpret_cast<char*>(&pimpl_->pidHash), sizeof(pimpl_->pidHash), std::back_inserter(result));
|
||||
std::copy_n(reinterpret_cast<char*>(&pimpl_->isCuMemMapAlloc), sizeof(pimpl_->isCuMemMapAlloc),
|
||||
std::back_inserter(result));
|
||||
std::copy_n(reinterpret_cast<char*>(&pimpl_->transports), sizeof(pimpl_->transports), std::back_inserter(result));
|
||||
if (pimpl_->transportInfos.size() > static_cast<size_t>(std::numeric_limits<int8_t>::max())) {
|
||||
throw mscclpp::Error("Too many transport info entries", ErrorCode::InternalError);
|
||||
@@ -84,10 +154,17 @@ MSCCLPP_API_CPP std::vector<char> RegisteredMemory::serialize() {
|
||||
for (auto& entry : pimpl_->transportInfos) {
|
||||
std::copy_n(reinterpret_cast<char*>(&entry.transport), sizeof(entry.transport), std::back_inserter(result));
|
||||
if (entry.transport == Transport::CudaIpc) {
|
||||
std::copy_n(reinterpret_cast<char*>(&entry.cudaIpcBaseHandle), sizeof(entry.cudaIpcBaseHandle),
|
||||
std::back_inserter(result));
|
||||
std::copy_n(reinterpret_cast<char*>(&entry.cudaIpcOffsetFromBase), sizeof(entry.cudaIpcOffsetFromBase),
|
||||
std::back_inserter(result));
|
||||
if (pimpl_->isCuMemMapAlloc) {
|
||||
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle),
|
||||
std::back_inserter(result));
|
||||
std::copy_n(reinterpret_cast<char*>(&entry.offsetFromBase), sizeof(entry.offsetFromBase),
|
||||
std::back_inserter(result));
|
||||
} else {
|
||||
std::copy_n(reinterpret_cast<char*>(&entry.cudaIpcBaseHandle), sizeof(entry.cudaIpcBaseHandle),
|
||||
std::back_inserter(result));
|
||||
std::copy_n(reinterpret_cast<char*>(&entry.cudaIpcOffsetFromBase), sizeof(entry.cudaIpcOffsetFromBase),
|
||||
std::back_inserter(result));
|
||||
}
|
||||
} else if (AllIBTransports.has(entry.transport)) {
|
||||
std::copy_n(reinterpret_cast<char*>(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result));
|
||||
} else {
|
||||
@@ -111,6 +188,8 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
|
||||
it += sizeof(this->hostHash);
|
||||
std::copy_n(it, sizeof(this->pidHash), reinterpret_cast<char*>(&this->pidHash));
|
||||
it += sizeof(this->pidHash);
|
||||
std::copy_n(it, sizeof(this->isCuMemMapAlloc), reinterpret_cast<char*>(&this->isCuMemMapAlloc));
|
||||
it += sizeof(this->isCuMemMapAlloc);
|
||||
std::copy_n(it, sizeof(this->transports), reinterpret_cast<char*>(&this->transports));
|
||||
it += sizeof(this->transports);
|
||||
int8_t transportCount;
|
||||
@@ -121,12 +200,19 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
|
||||
std::copy_n(it, sizeof(transportInfo.transport), reinterpret_cast<char*>(&transportInfo.transport));
|
||||
it += sizeof(transportInfo.transport);
|
||||
if (transportInfo.transport == Transport::CudaIpc) {
|
||||
std::copy_n(it, sizeof(transportInfo.cudaIpcBaseHandle),
|
||||
reinterpret_cast<char*>(&transportInfo.cudaIpcBaseHandle));
|
||||
it += sizeof(transportInfo.cudaIpcBaseHandle);
|
||||
std::copy_n(it, sizeof(transportInfo.cudaIpcOffsetFromBase),
|
||||
reinterpret_cast<char*>(&transportInfo.cudaIpcOffsetFromBase));
|
||||
it += sizeof(transportInfo.cudaIpcOffsetFromBase);
|
||||
if (this->isCuMemMapAlloc) {
|
||||
std::copy_n(it, sizeof(transportInfo.shareableHandle), reinterpret_cast<char*>(&transportInfo.shareableHandle));
|
||||
it += sizeof(transportInfo.shareableHandle);
|
||||
std::copy_n(it, sizeof(transportInfo.offsetFromBase), reinterpret_cast<char*>(&transportInfo.offsetFromBase));
|
||||
it += sizeof(transportInfo.offsetFromBase);
|
||||
} else {
|
||||
std::copy_n(it, sizeof(transportInfo.cudaIpcBaseHandle),
|
||||
reinterpret_cast<char*>(&transportInfo.cudaIpcBaseHandle));
|
||||
it += sizeof(transportInfo.cudaIpcBaseHandle);
|
||||
std::copy_n(it, sizeof(transportInfo.cudaIpcOffsetFromBase),
|
||||
reinterpret_cast<char*>(&transportInfo.cudaIpcOffsetFromBase));
|
||||
it += sizeof(transportInfo.cudaIpcOffsetFromBase);
|
||||
}
|
||||
} else if (AllIBTransports.has(transportInfo.transport)) {
|
||||
std::copy_n(it, sizeof(transportInfo.ibMrInfo), reinterpret_cast<char*>(&transportInfo.ibMrInfo));
|
||||
it += sizeof(transportInfo.ibMrInfo);
|
||||
@@ -148,8 +234,18 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
|
||||
// The memory is local to the machine but not to the process, so we need to open the CUDA IPC handle
|
||||
auto entry = getTransportInfo(Transport::CudaIpc);
|
||||
void* base;
|
||||
MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
|
||||
this->data = static_cast<char*>(base) + entry.cudaIpcOffsetFromBase;
|
||||
if (this->isCuMemMapAlloc) {
|
||||
CUmemGenericAllocationHandle handle;
|
||||
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsCompatibleMemHandleType()));
|
||||
size_t gran = getRecommendedGranularity();
|
||||
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&base, this->size, gran, 0, 0));
|
||||
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)base, this->size, 0, handle, 0));
|
||||
setReadWriteMemoryAccess(base, this->size);
|
||||
this->data = static_cast<char*>(base) + entry.offsetFromBase;
|
||||
} else {
|
||||
MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
|
||||
this->data = static_cast<char*>(base) + entry.cudaIpcOffsetFromBase;
|
||||
}
|
||||
INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", this->data);
|
||||
} else {
|
||||
// No valid data pointer can be set
|
||||
@@ -161,11 +257,22 @@ RegisteredMemory::Impl::~Impl() {
|
||||
// Close the CUDA IPC handle if it was opened during deserialization
|
||||
if (data && transports.has(Transport::CudaIpc) && getHostHash() == this->hostHash && getPidHash() != this->pidHash) {
|
||||
void* base = static_cast<char*>(data) - getTransportInfo(Transport::CudaIpc).cudaIpcOffsetFromBase;
|
||||
cudaError_t err = cudaIpcCloseMemHandle(base);
|
||||
if (err != cudaSuccess) {
|
||||
WARN("Failed to close CUDA IPC handle at pointer %p: %s", base, cudaGetErrorString(err));
|
||||
if (this->isCuMemMapAlloc) {
|
||||
CUmemGenericAllocationHandle handle;
|
||||
size_t size = 0;
|
||||
MSCCLPP_CULOG_WARN(cuMemRetainAllocationHandle(&handle, base));
|
||||
MSCCLPP_CULOG_WARN(cuMemRelease(handle));
|
||||
MSCCLPP_CULOG_WARN(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)base));
|
||||
MSCCLPP_CULOG_WARN(cuMemUnmap((CUdeviceptr)base, size));
|
||||
MSCCLPP_CULOG_WARN(cuMemRelease(handle));
|
||||
MSCCLPP_CULOG_WARN(cuMemAddressFree((CUdeviceptr)base, size));
|
||||
} else {
|
||||
INFO(MSCCLPP_P2P, "Closed CUDA IPC handle at pointer %p", base);
|
||||
cudaError_t err = cudaIpcCloseMemHandle(base);
|
||||
if (err != cudaSuccess) {
|
||||
WARN("Failed to close CUDA IPC handle at pointer %p: %s", base, cudaGetErrorString(err));
|
||||
} else {
|
||||
INFO(MSCCLPP_P2P, "Closed CUDA IPC handle at pointer %p", base);
|
||||
}
|
||||
}
|
||||
data = nullptr;
|
||||
}
|
||||
|
||||
19
src/utils.cc
19
src/utils.cc
@@ -68,12 +68,19 @@ std::string getHostName(int maxlen, const char delim) {
|
||||
}
|
||||
|
||||
bool isNvlsSupported() {
|
||||
#if (CUDART_VERSION >= 12010)
|
||||
CUdevice dev;
|
||||
int isNvlsSupported;
|
||||
MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
|
||||
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isNvlsSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
|
||||
return isNvlsSupported == 1;
|
||||
[[maybe_unused]] static bool result = false;
|
||||
[[maybe_unused]] static bool isChecked = false;
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
if (!isChecked) {
|
||||
int isMulticastSupported;
|
||||
int isFabricSupported;
|
||||
CUdevice dev;
|
||||
MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
|
||||
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
|
||||
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev));
|
||||
result = (isMulticastSupported == 1 && isFabricSupported == 1);
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
1458
test/execution-files/allreduce_nvls.json
Normal file
1458
test/execution-files/allreduce_nvls.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -131,7 +131,11 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
|
||||
mscclpp::ExecutionPlan plan(executionPlanName, executionPlanPath);
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
std::shared_ptr<char> sendbuff = mscclpp::allocSharedPhysicalCuda<char>(bufferSize);
|
||||
#else
|
||||
std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
|
||||
#endif
|
||||
std::vector<int> dataHost(bufferSize / sizeof(int), rank);
|
||||
MSCCLPP_CUDATHROW(cudaMemcpy(sendbuff.get(), dataHost.data(), bufferSize, cudaMemcpyHostToDevice));
|
||||
double deltaSec = benchTime(rank, bootstrap, executor, plan, sendbuff, bufferSize, niters, ngraphIters, packetType);
|
||||
|
||||
@@ -1,18 +1,17 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#if (USE_NVLS)
|
||||
#include <cuda.h>
|
||||
#include <cudaTypedefs.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <mpi.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <mscclpp/gpu.hpp>
|
||||
#if (CUDA_NVLS_SUPPORTED)
|
||||
#include <cuda.h>
|
||||
#include <cudaTypedefs.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#define CUCHECK(cmd) \
|
||||
do { \
|
||||
@@ -41,31 +40,31 @@
|
||||
#define MULTIMEM_LD(val, ptr)
|
||||
#endif
|
||||
|
||||
__global__ void init_kernel(float* uc_ptr, int size, int myrank, int nranks) {
|
||||
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) {
|
||||
__global__ void init_kernel(float* uc_ptr, size_t size, int myrank, int nranks) {
|
||||
for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) {
|
||||
uc_ptr[idx] = myrank + idx;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void check_correctness(float* uc_ptr, int size, int myrank, int nranks) {
|
||||
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) {
|
||||
__global__ void check_correctness(float* uc_ptr, size_t size, int myrank, int nranks) {
|
||||
for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) {
|
||||
float expected = (float)((nranks * (nranks - 1)) / 2 + nranks * idx);
|
||||
if (abs(uc_ptr[idx] - expected) > 0.01 * expected) {
|
||||
printf("error! idx %d: %f != %f\n", idx, uc_ptr[idx], expected);
|
||||
printf("error! idx %ld: %f != %f\n", idx, uc_ptr[idx], expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void testing(float* mc_ptr, int size, int myrank, int nranks) {
|
||||
__global__ void testing(float* mc_ptr, size_t size, int myrank, int nranks) {
|
||||
// for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction
|
||||
// line is assumed to be 16B 4 ints of 8 halves
|
||||
int my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks;
|
||||
int my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks;
|
||||
size_t my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks;
|
||||
size_t my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks;
|
||||
|
||||
int my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
|
||||
int my_step = blockDim.x * gridDim.x * 4;
|
||||
size_t my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
|
||||
size_t my_step = blockDim.x * gridDim.x * 4;
|
||||
|
||||
for (int idx = my_st + my_offset; idx < my_en; idx += my_step) {
|
||||
for (size_t idx = my_st + my_offset; idx < my_en; idx += my_step) {
|
||||
[[maybe_unused]] uint4 val;
|
||||
MULTIMEM_LD(val, mc_ptr + idx);
|
||||
MULTIMEM_ST(val, mc_ptr + idx);
|
||||
@@ -80,7 +79,7 @@ int main() {
|
||||
|
||||
cudaSetDevice(myrank);
|
||||
|
||||
size_t size = 1024 * 1024 * 512;
|
||||
size_t size = 1024ULL * 1024ULL * 512ULL * 16;
|
||||
CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
|
||||
CUmulticastObjectProp mcProp = {};
|
||||
@@ -138,7 +137,7 @@ int main() {
|
||||
prop.requestedHandleTypes = handleType;
|
||||
|
||||
// allocate physical memory (data buffer)
|
||||
CUCHECK(cuMemCreate(&memhandle, size, &prop, 0 /*flags*/));
|
||||
CUCHECK(cuMemCreate(&memhandle, mcSize, &prop, 0 /*flags*/));
|
||||
|
||||
void* uc_va;
|
||||
void* mc_va;
|
||||
@@ -148,14 +147,14 @@ int main() {
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
|
||||
// Map a VA to UC space
|
||||
CUCHECK(cuMemAddressReserve((CUdeviceptr*)&uc_va, size, minGran, 0U, 0));
|
||||
cudaMemset(uc_va, 0, size);
|
||||
CUCHECK(cuMemMap((CUdeviceptr)uc_va, size, 0, memhandle, 0));
|
||||
CUCHECK(cuMemAddressReserve((CUdeviceptr*)&uc_va, mcSize, minGran, 0U, 0));
|
||||
cudaMemset(uc_va, 0, mcSize);
|
||||
CUCHECK(cuMemMap((CUdeviceptr)uc_va, mcSize, 0, memhandle, 0));
|
||||
// set access on UC address
|
||||
CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, size, &accessDesc, 1));
|
||||
CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, mcSize, &accessDesc, 1));
|
||||
|
||||
// everyone binds memory to the multicast
|
||||
CUCHECK(cuMulticastBindMem(handle, 0 /*mcOffset*/, memhandle, 0 /*memOffset*/, size, 0));
|
||||
CUCHECK(cuMulticastBindAddr(handle, 0 /*mcOffset*/, (CUdeviceptr)uc_va, mcSize, 0));
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
// usual VA business: map both MC and PA to two different VA addresses
|
||||
|
||||
@@ -203,11 +202,11 @@ int main() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else // !(USE_NVLS)
|
||||
#else // !(CUDA_NVLS_SUPPORTED)
|
||||
|
||||
int main() {
|
||||
printf("This test requires NVLS to be enabled\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // !(USE_NVLS)
|
||||
#endif // !(CUDA_NVLS_SUPPORTED)
|
||||
|
||||
Reference in New Issue
Block a user