mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
Major enhancements to the IB signal-forwarding mechanism (`host-no-atomic` mode), primarily adding support for GDRCopy and MLX5 Direct Verbs, and refactoring the signal-forwarding path for IB HostNoAtomic mode. The changes fix memory-consistency issues and reduce signaling latency. Highlights: GDRCopy and MLX5 Direct Verbs MR integration; signal-forwarding path redesign; semaphore and connection API updates; environment variable (`MSCCLPP_FORCE_DISABLE_GDR`) and documentation updates.
72 lines
2.5 KiB
C++
// Copyright (c) Microsoft Corporation.
|
|
// Licensed under the MIT license.
|
|
|
|
#ifndef MSCCLPP_ATOMIC_DEVICE_HPP_
|
|
#define MSCCLPP_ATOMIC_DEVICE_HPP_
|
|
|
|
#include "device.hpp"
|
|
|
|
#if defined(MSCCLPP_DEVICE_CUDA)
|
|
#include <cuda/atomic>
|
|
#endif // defined(MSCCLPP_DEVICE_CUDA)
|
|
|
|
namespace mscclpp {

#if defined(MSCCLPP_DEVICE_CUDA)

// Aliases for the libcu++ memory orders so that both branches of this header
// (CUDA and non-CUDA) expose the same spellings to callers.
constexpr cuda::memory_order memoryOrderRelaxed = cuda::memory_order_relaxed;
constexpr cuda::memory_order memoryOrderAcquire = cuda::memory_order_acquire;
constexpr cuda::memory_order memoryOrderRelease = cuda::memory_order_release;
constexpr cuda::memory_order memoryOrderAcqRel = cuda::memory_order_acq_rel;
constexpr cuda::memory_order memoryOrderSeqCst = cuda::memory_order_seq_cst;

// Thread-scope aliases: scopeSystem orders the operation system-wide
// (CPU + all GPUs), scopeDevice only within the current device.
constexpr cuda::thread_scope scopeSystem = cuda::thread_scope_system;
constexpr cuda::thread_scope scopeDevice = cuda::thread_scope_device;
/// Atomically load the value at @p ptr with the given memory order.
/// @tparam T     Element type (must be usable with cuda::atomic_ref).
/// @tparam Scope Thread scope of the operation; defaults to system-wide.
/// @param ptr         Pointer to the value to load; must be suitably aligned for atomic access.
/// @param memoryOrder Memory ordering constraint for the load.
/// @return The loaded value.
template <typename T, cuda::thread_scope Scope = cuda::thread_scope_system>
MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(T* ptr, cuda::memory_order memoryOrder) {
  return cuda::atomic_ref<T, Scope>{*ptr}.load(memoryOrder);
}
/// Atomically store @p val to @p ptr with the given memory order.
/// @tparam T     Element type (must be usable with cuda::atomic_ref).
/// @tparam Scope Thread scope of the operation; defaults to system-wide.
/// @param ptr         Pointer to the destination; must be suitably aligned for atomic access.
/// @param val         Value to store.
/// @param memoryOrder Memory ordering constraint for the store.
template <typename T, cuda::thread_scope Scope = cuda::thread_scope_system>
MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, cuda::memory_order memoryOrder) {
  cuda::atomic_ref<T, Scope>{*ptr}.store(val, memoryOrder);
}
/// Atomically add @p val to the value at @p ptr and return the previous value.
/// @tparam T     Element type (must be usable with cuda::atomic_ref).
/// @tparam Scope Thread scope of the operation; defaults to system-wide.
/// @param ptr         Pointer to the accumulator; must be suitably aligned for atomic access.
/// @param val         Value to add.
/// @param memoryOrder Memory ordering constraint for the read-modify-write.
/// @return The value stored at @p ptr immediately before the addition.
template <typename T, cuda::thread_scope Scope = cuda::thread_scope_system>
MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_order memoryOrder) {
  return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder);
}
#else  // !defined(MSCCLPP_DEVICE_CUDA)

// Non-CUDA fallback: map the memory-order names onto the GCC/Clang
// __atomic builtin ordering constants so the same call sites compile on host.
constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;

// The __atomic builtins have no notion of thread scope, so both scopes
// collapse to the same placeholder value; the template parameter is unused.
constexpr auto scopeSystem = 0;
constexpr auto scopeDevice = 0;
template <typename T, int scope = scopeSystem>
|
|
MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(const T* ptr, int memoryOrder) {
|
|
return __atomic_load_n(ptr, memoryOrder);
|
|
}
|
|
|
|
template <typename T, int scope = scopeSystem>
|
|
MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, int memoryOrder) {
|
|
__atomic_store_n(ptr, val, memoryOrder);
|
|
}
|
|
|
|
template <typename T, int scope = scopeSystem>
|
|
MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrder) {
|
|
return __atomic_fetch_add(ptr, val, memoryOrder);
|
|
}
|
|
|
|
#endif // !defined(MSCCLPP_DEVICE_CUDA)
|
|
|
|
} // namespace mscclpp
|
|
|
|
#endif // MSCCLPP_ATOMIC_DEVICE_HPP_
|