mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-19 22:39:11 +00:00
FIFO improvements (#557)
* Revert `MSCCLPP_FIFO_USE_TAIL_REPLICA=1` back to the default. * Optimize `FifoDeviceHandle`. * Do not use `cudaHostAllocWriteCombined` that increases latency. * Pin host memory for `Host2DeviceSemaphore::outboundSemaphore_`. * Fix proxy NUMA binding issues. * Prevent graph capture inside proxy threads. * Now `CudaIpcConnection` skips stream sync when unnecessary. * Now any type of connection needs to hold a shared pointer to the context for memory safety. * Now a context should be always managed by a shared pointer for memory safety. * Minor docs & interface improvements. * Minor fix in `mscclpp-test` correctness test.
This commit is contained in:
@@ -63,36 +63,22 @@ else()
|
||||
endif()
|
||||
|
||||
if(MSCCLPP_GPU_ARCHS)
|
||||
# Remove any leading/trailing whitespace
|
||||
string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS)
|
||||
|
||||
# Split the string into a list
|
||||
string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")
|
||||
string(REPLACE "," ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")
|
||||
|
||||
# Check if the list is empty
|
||||
if(NOT MSCCLPP_GPU_ARCHS)
|
||||
message(FATAL_ERROR "MSCCLPP_GPU_ARCHS is given empty. Please specify GPU architectures or do not set MSCCLPP_GPU_ARCHS.")
|
||||
message(FATAL_ERROR "MSCCLPP_GPU_ARCHS is empty. Specify GPU architectures or leave unset.")
|
||||
endif()
|
||||
elseif(MSCCLPP_USE_CUDA)
|
||||
# CUDA 11 or higher is required
|
||||
if(CUDAToolkit_VERSION_MAJOR LESS 11)
|
||||
message(FATAL_ERROR "CUDA 11 or higher is required but detected ${CUDAToolkit_VERSION}")
|
||||
if(CUDAToolkit_VERSION VERSION_LESS "11.8")
|
||||
message(FATAL_ERROR "CUDA 11.8 or higher required, found ${CUDAToolkit_VERSION}")
|
||||
endif()
|
||||
|
||||
# Ampere architecture
|
||||
if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11)
|
||||
set(MSCCLPP_GPU_ARCHS 80)
|
||||
set(MSCCLPP_GPU_ARCHS 80)
|
||||
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.0")
|
||||
list(APPEND MSCCLPP_GPU_ARCHS 90)
|
||||
endif()
|
||||
|
||||
# Hopper architecture
|
||||
if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12)
|
||||
set(MSCCLPP_GPU_ARCHS ${MSCCLPP_GPU_ARCHS} 90)
|
||||
endif()
|
||||
|
||||
# Blackwell architecture
|
||||
if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12 AND CUDAToolkit_VERSION_MINOR GREATER_EQUAL 8)
|
||||
set(MSCCLPP_GPU_ARCHS ${MSCCLPP_GPU_ARCHS} 100)
|
||||
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
list(APPEND MSCCLPP_GPU_ARCHS 100)
|
||||
endif()
|
||||
elseif(MSCCLPP_USE_ROCM)
|
||||
set(CMAKE_HIP_ARCHITECTURES gfx90a gfx941 gfx942)
|
||||
|
||||
@@ -127,8 +127,8 @@ class TcpBootstrap : public Bootstrap {
|
||||
/// @return The unique ID stored in the TcpBootstrap.
|
||||
UniqueId getUniqueId() const;
|
||||
|
||||
/// Initialize the TcpBootstrap with a given unique ID. The unique ID can be generated by any methods;
|
||||
/// it can be created by createUniqueId() or can be any arbitrary bit arrays provided by the user.
|
||||
/// Initialize the TcpBootstrap with a given unique ID. The unique ID can be generated by any method;
|
||||
/// it can be created by createUniqueId() or can be any arbitrary bit array provided by the user.
|
||||
/// @param uniqueId The unique ID to initialize the TcpBootstrap with.
|
||||
/// @param timeoutSec The connection timeout in seconds.
|
||||
void initialize(UniqueId uniqueId, int64_t timeoutSec = 30);
|
||||
@@ -453,7 +453,7 @@ class Endpoint {
|
||||
/// @return A vector of characters representing the serialized Endpoint object.
|
||||
std::vector<char> serialize();
|
||||
|
||||
/// Deserialize a Endpoint object from a vector of characters.
|
||||
/// Deserialize an Endpoint object from a vector of characters.
|
||||
///
|
||||
/// @param data A vector of characters representing a serialized Endpoint object.
|
||||
/// @return A deserialized Endpoint object.
|
||||
@@ -473,8 +473,10 @@ class Connection {
|
||||
public:
|
||||
/// Constructor.
|
||||
/// @param maxWriteQueueSize The maximum number of write requests that can be queued.
|
||||
Connection(int maxWriteQueueSize) : maxWriteQueueSize(maxWriteQueueSize){};
|
||||
Connection(std::shared_ptr<Context> context, int maxWriteQueueSize)
|
||||
: context_(context), maxWriteQueueSize_(maxWriteQueueSize){};
|
||||
|
||||
/// Destructor.
|
||||
virtual ~Connection() = default;
|
||||
|
||||
/// Write data from a source RegisteredMemory to a destination RegisteredMemory.
|
||||
@@ -487,7 +489,7 @@ class Connection {
|
||||
virtual void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
|
||||
uint64_t size) = 0;
|
||||
|
||||
/// Update a 8-byte value in a destination RegisteredMemory and synchronize the change with the remote process.
|
||||
/// Update an 8-byte value in a destination RegisteredMemory and synchronize the change with the remote process.
|
||||
///
|
||||
/// @param dst The destination RegisteredMemory.
|
||||
/// @param dstOffset The offset in bytes from the start of the destination RegisteredMemory.
|
||||
@@ -522,7 +524,9 @@ class Connection {
|
||||
// Internal methods for getting implementation pointers.
|
||||
static std::shared_ptr<RegisteredMemory::Impl> getImpl(RegisteredMemory& memory);
|
||||
static std::shared_ptr<Endpoint::Impl> getImpl(Endpoint& memory);
|
||||
int maxWriteQueueSize;
|
||||
|
||||
std::shared_ptr<Context> context_;
|
||||
int maxWriteQueueSize_;
|
||||
};
|
||||
|
||||
/// Used to configure an endpoint.
|
||||
@@ -567,19 +571,19 @@ struct EndpointConfig {
|
||||
/// 1. The client creates an endpoint with createEndpoint() and sends it to the server.
|
||||
/// 2. The server receives the client endpoint, creates its own endpoint with createEndpoint(), sends it to the
|
||||
/// client, and creates a connection with connect().
|
||||
/// 4. The client receives the server endpoint, creates a connection with connect() and sends a
|
||||
/// 3. The client receives the server endpoint, creates a connection with connect() and sends a
|
||||
/// RegisteredMemory to the server.
|
||||
/// 5. The server receives the RegisteredMemory and writes to it using the previously created connection.
|
||||
/// The client waiting to create a connection before sending the RegisteredMemory ensures that the server can not
|
||||
/// 4. The server receives the RegisteredMemory and writes to it using the previously created connection.
|
||||
/// The client waiting to create a connection before sending the RegisteredMemory ensures that the server cannot
|
||||
/// write to the RegisteredMemory before the connection is established.
|
||||
///
|
||||
/// While some transports may have more relaxed implementation behavior, this should not be relied upon.
|
||||
class Context {
|
||||
class Context : public std::enable_shared_from_this<Context> {
|
||||
public:
|
||||
/// Create a context.
|
||||
Context();
|
||||
/// Create a new Context instance.
|
||||
static std::shared_ptr<Context> create() { return std::shared_ptr<Context>(new Context()); }
|
||||
|
||||
/// Destroy the context.
|
||||
/// Destructor.
|
||||
~Context();
|
||||
|
||||
/// Register a region of GPU memory for use in this context.
|
||||
@@ -606,6 +610,8 @@ class Context {
|
||||
std::shared_ptr<Connection> connect(Endpoint localEndpoint, Endpoint remoteEndpoint);
|
||||
|
||||
private:
|
||||
Context();
|
||||
|
||||
struct Impl;
|
||||
std::unique_ptr<Impl> pimpl_;
|
||||
|
||||
@@ -620,7 +626,7 @@ using NonblockingFuture [[deprecated("Use std::shared_future instead. This will
|
||||
/// A class that sets up all registered memories and connections between processes.
|
||||
///
|
||||
/// A typical way to use this class:
|
||||
/// 1. Call connect() to declare connections between the calling process with other processes.
|
||||
/// 1. Call connect() to declare connections between the calling process and other processes.
|
||||
/// 2. Call registerMemory() to register memory regions that will be used for communication.
|
||||
/// 3. Call sendMemory() or recvMemory() to send/receive registered memory regions to/from
|
||||
/// other processes.
|
||||
@@ -670,7 +676,7 @@ using NonblockingFuture [[deprecated("Use std::shared_future instead. This will
|
||||
/// auto connection = communicator.connect(0, tag, Transport::CudaIpc); // undefined behavior
|
||||
/// communicator.sendMemory(memory1, 0, tag);
|
||||
/// ```
|
||||
/// In the wrong example, the connection information from rank 1 will be sent to `mem1` object on rank 0,
|
||||
/// In the wrong example, the connection information from rank 1 will be sent to the `mem1` object on rank 0,
|
||||
/// where the object type is RegisteredMemory, not Connection.
|
||||
///
|
||||
class Communicator {
|
||||
@@ -762,7 +768,7 @@ class Communicator {
|
||||
/// the first get() on the future.
|
||||
/// Note that this function is two-way and a connection from rank `i` to remote rank `j` needs
|
||||
/// to have a counterpart from rank `j` to rank `i`. Note that with IB, buffers are registered at a page level and if
|
||||
/// a buffer is spread through multiple pages and do not fully utilize all of them, IB's QP has to register for all
|
||||
/// a buffer is spread through multiple pages and does not fully utilize all of them, IB's QP has to register for all
|
||||
/// involved pages. This potentially has security risks if the connection's accesses are given to a malicious process.
|
||||
///
|
||||
/// Multiple calls to either sendMemory() or connect() with the same @p remoteRank and @p tag will be ordered by
|
||||
@@ -818,11 +824,11 @@ extern const TransportFlags AllIBTransports;
|
||||
/// A constant TransportFlags object representing all transports.
|
||||
extern const TransportFlags AllTransports;
|
||||
|
||||
/// A type which could be safely used in device side.
|
||||
/// A type which could be safely used on the device side.
|
||||
template <class T>
|
||||
using DeviceHandle = typename T::DeviceHandle;
|
||||
|
||||
/// Retrieve the deviceHandle instance from host object.
|
||||
/// Retrieve the deviceHandle instance from a host object.
|
||||
template <typename T>
|
||||
DeviceHandle<std::remove_reference_t<T>> deviceHandle(T&& t) {
|
||||
return t.deviceHandle();
|
||||
|
||||
@@ -93,7 +93,7 @@ class Env {
|
||||
/// Env name: `MSCCLPP_FIFO_USE_TAIL_REPLICA`. If set to true, it will replicate the FIFO tail on the GPU memory,
|
||||
/// which makes the GPU poll on the tail faster, but requires a periodic FIFO flush to update the replica on the GPU.
|
||||
/// If set to false, the GPU will directly read the tail from the host memory, which is slower but does not require
|
||||
/// periodic flushes. Default is false.
|
||||
/// periodic flushes. Default is true.
|
||||
const bool fifoUseTailReplica;
|
||||
|
||||
private:
|
||||
|
||||
@@ -4,51 +4,46 @@
|
||||
#ifndef MSCCLPP_FIFO_HPP_
|
||||
#define MSCCLPP_FIFO_HPP_
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
|
||||
#include "fifo_device.hpp"
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
constexpr size_t DEFAULT_FIFO_SIZE = 128;
|
||||
constexpr size_t DEFAULT_FIFO_SIZE = 512;
|
||||
|
||||
/// A class representing a host proxy FIFO that can consume work elements pushed by device threads.
|
||||
/// Host-side proxy FIFO for device-produced work elements.
|
||||
class Fifo {
|
||||
public:
|
||||
/// Constructs a new Fifo object.
|
||||
/// @param size The number of entires in the FIFO.
|
||||
/// Constructor.
|
||||
/// @param size Number of entries (default: DEFAULT_FIFO_SIZE).
|
||||
Fifo(int size = DEFAULT_FIFO_SIZE);
|
||||
|
||||
/// Destroys the Fifo object.
|
||||
/// Destructor.
|
||||
~Fifo();
|
||||
|
||||
/// Polls the FIFO for a trigger.
|
||||
///
|
||||
/// Returns ProxyTrigger which is the trigger at the head of fifo.
|
||||
/// Poll and get the trigger at the head.
|
||||
/// @return ProxyTrigger at the head of the FIFO.
|
||||
ProxyTrigger poll();
|
||||
|
||||
/// Pops a trigger from the FIFO.
|
||||
/// Remove the head trigger.
|
||||
void pop();
|
||||
|
||||
/// Flushes the tail of the FIFO.
|
||||
///
|
||||
/// @param sync If true, waits for the flush to complete before returning.
|
||||
void flushTail(bool sync = false);
|
||||
|
||||
/// Return the FIFO size.
|
||||
/// @return The FIFO size.
|
||||
/// Get FIFO size.
|
||||
/// @return Number of entries in the FIFO.
|
||||
int size() const;
|
||||
|
||||
/// Returns a FifoDeviceHandle object representing the device FIFO.
|
||||
///
|
||||
/// @return A FifoDeviceHandle object representing the device FIFO.
|
||||
/// Get device-side FIFO handle.
|
||||
/// @return FifoDeviceHandle for device access.
|
||||
FifoDeviceHandle deviceHandle() const;
|
||||
|
||||
private:
|
||||
struct Impl;
|
||||
std::unique_ptr<Impl> pimpl;
|
||||
std::unique_ptr<Impl> pimpl_;
|
||||
};
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -15,7 +15,11 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
/// A struct representing a pair of 64-bit unsigned integers used as a trigger for the proxy.
|
||||
#if defined(MSCCLPP_DEVICE_COMPILE)
|
||||
MSCCLPP_DEVICE_INLINE uint64_t hostLoadRelaxed(uint64_t* ptr) { return atomicLoad(ptr, memoryOrderRelaxed); }
|
||||
#endif // defined(MSCCLPP_DEVICE_COMPILE)
|
||||
|
||||
/// Pair of 64-bit unsigned integers used as a trigger for the proxy.
|
||||
///
|
||||
/// This struct is used as a work element in the concurrent FIFO where multiple device threads can push
|
||||
/// ProxyTrigger elements and a single host proxy thread consumes these work elements.
|
||||
@@ -45,68 +49,63 @@ struct alignas(16) ProxyTrigger {
|
||||
struct FifoDeviceHandle {
|
||||
#if defined(MSCCLPP_DEVICE_COMPILE)
|
||||
/// Push a trigger to the FIFO.
|
||||
///
|
||||
/// @param trigger The trigger to push.
|
||||
/// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative.
|
||||
/// @return The new head of the FIFO.
|
||||
/// @param trigger Trigger to push.
|
||||
/// @param maxSpinCount Max spin count before assert. Never assert if negative.
|
||||
/// @return Previous head of the FIFO where the trigger was pushed.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t push(ProxyTrigger trigger, [[maybe_unused]] int64_t maxSpinCount = 1000000) {
|
||||
uint64_t curFifoHead = atomicFetchAdd(this->head, (uint64_t)1, memoryOrderRelaxed);
|
||||
uint64_t prevHead = atomicFetchAdd<uint64_t, scopeDevice>(head, 1, memoryOrderRelaxed);
|
||||
|
||||
// make the last bit intentionally non-zero so that we can safely poll. Don't worry, we will change it back in host
|
||||
// side
|
||||
trigger.snd ^= ((uint64_t)1 << (uint64_t)63);
|
||||
// Flip the last bit for safe polling; host will revert.
|
||||
constexpr uint64_t flipMask = uint64_t{1} << uint64_t{63};
|
||||
trigger.snd ^= flipMask;
|
||||
|
||||
// Only one of two conditions need to be met to proceed. Either the tail has advanced enough or where we need to
|
||||
// write to is 0. However, the first condition is faster to check since the tail is flushed periodically anyways but
|
||||
// for the second condition we need to read CPU memory.
|
||||
// As atomic access is slow, we first check using the bare pointer and then use the atomic load if the
|
||||
// condition is not met.
|
||||
if (curFifoHead >= size + *(this->tailReplica)) {
|
||||
OR_POLL_MAYBE_JAILBREAK((curFifoHead >= size + atomicLoad(this->tailReplica, memoryOrderRelaxed)),
|
||||
(atomicLoad(&(this->triggers[curFifoHead % size].fst), memoryOrderRelaxed) != 0),
|
||||
maxSpinCount);
|
||||
if (prevHead >= size + *tailReplica) {
|
||||
OR_POLL_MAYBE_JAILBREAK((prevHead >= size + atomicLoad(tailReplica, memoryOrderRelaxed)),
|
||||
(hostLoadRelaxed(&(triggers[prevHead % size].fst)) != 0), maxSpinCount);
|
||||
}
|
||||
|
||||
ProxyTrigger* triggerPtr = &(this->triggers[curFifoHead % size]);
|
||||
ProxyTrigger* triggerPtr = &(triggers[prevHead % size]);
|
||||
|
||||
// Make sure the data is visible to the host before we update the tail.
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
#if __CUDA_ARCH__ == 800
|
||||
// For A100, threadfence_system is more efficient than release
|
||||
// This is faster than release for A100.
|
||||
__threadfence_system();
|
||||
asm volatile("st.global.relaxed.sys.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd));
|
||||
#else
|
||||
asm volatile("st.global.release.sys.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd));
|
||||
#endif
|
||||
#else // !defined(MSCCLPP_DEVICE_CUDA)
|
||||
// store snd no later than fst.
|
||||
// Store snd no later than fst.
|
||||
atomicStore(&(triggerPtr->snd), trigger.snd, memoryOrderRelaxed);
|
||||
atomicStore(&(triggerPtr->fst), trigger.fst, memoryOrderRelease);
|
||||
#endif // !defined(MSCCLPP_DEVICE_CUDA)
|
||||
|
||||
return curFifoHead;
|
||||
return prevHead;
|
||||
}
|
||||
|
||||
/// Wait until there is a place in the FIFO to push a trigger.
|
||||
///
|
||||
/// @param curFifoHead The current head of the FIFO.
|
||||
/// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative.
|
||||
MSCCLPP_DEVICE_INLINE void sync(uint64_t curFifoHead, [[maybe_unused]] int64_t maxSpinCount = 1000000) {
|
||||
// Same as push but in this case checking the fist condition is probably faster since for tail to be pushed we need
|
||||
/// Wait until a specific trigger is popped from the FIFO.
|
||||
/// @param fifoHead FIFO head where the trigger was pushed.
|
||||
/// @param maxSpinCount Max spin count before assert. Never assert if negative.
|
||||
MSCCLPP_DEVICE_INLINE void sync(uint64_t fifoHead, [[maybe_unused]] int64_t maxSpinCount = 1000000) {
|
||||
// Same as push but in this case checking the first condition is probably faster since for tail to be pushed we need
|
||||
// to wait for cudaMemcpy to be done.
|
||||
OR_POLL_MAYBE_JAILBREAK((curFifoHead >= atomicLoad(this->tailReplica, memoryOrderRelaxed)),
|
||||
(atomicLoad(&(this->triggers[curFifoHead % size].fst), memoryOrderRelaxed) != 0),
|
||||
maxSpinCount);
|
||||
OR_POLL_MAYBE_JAILBREAK((fifoHead >= atomicLoad(tailReplica, memoryOrderRelaxed)),
|
||||
(hostLoadRelaxed(&(triggers[fifoHead % size].fst)) != 0), maxSpinCount);
|
||||
}
|
||||
#endif // defined(MSCCLPP_DEVICE_COMPILE)
|
||||
|
||||
/// The FIFO buffer that is allocated on the host via `cudaHostAlloc()`.
|
||||
/// FIFO buffer on host.
|
||||
ProxyTrigger* triggers;
|
||||
/// Replica of the FIFO tail.
|
||||
uint64_t* tailReplica;
|
||||
/// The FIFO head. Allocated on the device and only accessed by the device.
|
||||
/// FIFO head on device.
|
||||
uint64_t* head;
|
||||
/// The FIFO size.
|
||||
/// FIFO tail replica on device.
|
||||
uint64_t* tailReplica;
|
||||
/// FIFO size.
|
||||
int size;
|
||||
};
|
||||
|
||||
|
||||
@@ -123,7 +123,7 @@ namespace detail {
|
||||
void setReadWriteMemoryAccess(void* base, size_t size);
|
||||
|
||||
void* gpuCalloc(size_t bytes);
|
||||
void* gpuCallocHost(size_t bytes);
|
||||
void* gpuCallocHost(size_t bytes, unsigned int flags);
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
void* gpuCallocUncached(size_t bytes);
|
||||
#endif // defined(__HIP_PLATFORM_AMD__)
|
||||
@@ -206,13 +206,13 @@ auto gpuCallocUnique(size_t nelems = 1) {
|
||||
}
|
||||
|
||||
template <class T>
|
||||
auto gpuCallocHostShared(size_t nelems = 1) {
|
||||
return detail::safeAlloc<T, detail::GpuHostDeleter<T>, std::shared_ptr<T>>(detail::gpuCallocHost, nelems);
|
||||
auto gpuCallocHostShared(size_t nelems = 1, unsigned int flags = cudaHostAllocMapped) {
|
||||
return detail::safeAlloc<T, detail::GpuHostDeleter<T>, std::shared_ptr<T>>(detail::gpuCallocHost, nelems, flags);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
auto gpuCallocHostUnique(size_t nelems = 1) {
|
||||
return detail::safeAlloc<T, detail::GpuHostDeleter<T>, UniqueGpuHostPtr<T>>(detail::gpuCallocHost, nelems);
|
||||
auto gpuCallocHostUnique(size_t nelems = 1, unsigned int flags = cudaHostAllocMapped) {
|
||||
return detail::safeAlloc<T, detail::GpuHostDeleter<T>, UniqueGpuHostPtr<T>>(detail::gpuCallocHost, nelems, flags);
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
@@ -35,12 +35,6 @@ struct BaseMemoryChannelDeviceHandle {
|
||||
///
|
||||
MSCCLPP_DEVICE_INLINE void relaxedSignal() { semaphore_.relaxedSignal(); }
|
||||
|
||||
/// Increase the counter of the local semaphore.
|
||||
MSCCLPP_DEVICE_INLINE void semaphoreIncrement() { semaphore_.semaphoreIncrement(); }
|
||||
|
||||
/// Read the counter of the local semaphore.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t semaphoreGetLocal() const { return semaphore_.semaphoreGetLocal(); }
|
||||
|
||||
/// Check if the remote semaphore has signaled.
|
||||
/// @return true if the remote semaphore has signaled.
|
||||
MSCCLPP_DEVICE_INLINE bool poll() { return semaphore_.poll(); }
|
||||
|
||||
@@ -27,8 +27,8 @@ class BaseProxyService {
|
||||
class ProxyService : public BaseProxyService {
|
||||
public:
|
||||
/// Constructor.
|
||||
/// @param fifoSize The size of the FIFO used by the proxy service. Default is DEFAULT_FIFO_SIZE.
|
||||
ProxyService(size_t fifoSize = DEFAULT_FIFO_SIZE);
|
||||
/// @param fifoSize Size of the FIFO used by the proxy service (default: DEFAULT_FIFO_SIZE).
|
||||
ProxyService(int fifoSize = DEFAULT_FIFO_SIZE);
|
||||
|
||||
/// Build and add a semaphore to the proxy service.
|
||||
/// @param connection The connection associated with the semaphore.
|
||||
@@ -72,10 +72,7 @@ class ProxyService : public BaseProxyService {
|
||||
std::vector<std::shared_ptr<Host2DeviceSemaphore>> semaphores_;
|
||||
std::vector<RegisteredMemory> memories_;
|
||||
std::shared_ptr<Proxy> proxy_;
|
||||
int deviceNumaNode;
|
||||
std::unordered_map<std::shared_ptr<Connection>, int> inflightRequests;
|
||||
|
||||
void bindThread();
|
||||
std::unordered_map<std::shared_ptr<Connection>, int> inflightRequests_;
|
||||
|
||||
ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw);
|
||||
};
|
||||
|
||||
@@ -11,53 +11,51 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
/// Possible return values of a ProxyHandler.
|
||||
/// Return values for ProxyHandler.
|
||||
enum class ProxyHandlerResult {
|
||||
/// Move to the next trigger in the FIFO.
|
||||
/// Move to next trigger in FIFO.
|
||||
Continue,
|
||||
/// Flush the FIFO and continue to the next trigger.
|
||||
/// Flush the FIFO and move to next trigger.
|
||||
FlushFifoTailAndContinue,
|
||||
/// Stop the proxy and exit.
|
||||
/// Stop and exit proxy.
|
||||
Stop,
|
||||
};
|
||||
|
||||
class Proxy;
|
||||
|
||||
/// Type of handler function for the proxy.
|
||||
/// Handler function type for proxy.
|
||||
using ProxyHandler = std::function<ProxyHandlerResult(ProxyTrigger)>;
|
||||
|
||||
/// Host-side proxy for PortChannels.
|
||||
class Proxy {
|
||||
public:
|
||||
/// Constructor of Proxy.
|
||||
/// @param handler The handler function to be called for each trigger in the FIFO.
|
||||
/// @param threadInit Optional function to be called in the proxy thread before starting the FIFO consumption.
|
||||
/// @param fifoSize The size of the FIFO. Default is DEFAULT_FIFO_SIZE.
|
||||
Proxy(ProxyHandler handler, std::function<void()> threadInit, size_t fifoSize = DEFAULT_FIFO_SIZE);
|
||||
/// Constructor.
|
||||
/// @param handler Handler for each FIFO trigger.
|
||||
/// @param threadInit Optional function run in proxy thread before FIFO consumption.
|
||||
/// @param fifoSize FIFO size (default: DEFAULT_FIFO_SIZE).
|
||||
Proxy(ProxyHandler handler, std::function<void()> threadInit, int fifoSize = DEFAULT_FIFO_SIZE);
|
||||
|
||||
/// Constructor of Proxy.
|
||||
/// @param handler The handler function to be called for each trigger in the FIFO.
|
||||
/// @param fifoSize The size of the FIFO. Default is DEFAULT_FIFO_SIZE.
|
||||
Proxy(ProxyHandler handler, size_t fifoSize = DEFAULT_FIFO_SIZE);
|
||||
/// Constructor.
|
||||
/// @param handler Handler for each FIFO trigger.
|
||||
/// @param fifoSize FIFO size (default: DEFAULT_FIFO_SIZE).
|
||||
Proxy(ProxyHandler handler, int fifoSize = DEFAULT_FIFO_SIZE);
|
||||
|
||||
/// Destructor of Proxy.
|
||||
/// This will stop the proxy if it is running.
|
||||
/// Destructor. Stops proxy if running.
|
||||
~Proxy();
|
||||
|
||||
/// Start the proxy.
|
||||
/// Start proxy.
|
||||
void start();
|
||||
|
||||
/// Stop the proxy.
|
||||
/// Stop proxy.
|
||||
void stop();
|
||||
|
||||
/// This is a concurrent fifo which is multiple threads from the device
|
||||
/// can produce for and the sole proxy thread consumes it.
|
||||
/// @return A reference to the FIFO object used by the proxy.
|
||||
Fifo& fifo();
|
||||
/// Get reference to FIFO used by proxy.
|
||||
/// @return Shared pointer to FIFO.
|
||||
std::shared_ptr<Fifo> fifo();
|
||||
|
||||
private:
|
||||
struct Impl;
|
||||
std::unique_ptr<Impl> pimpl;
|
||||
std::unique_ptr<Impl> pimpl_;
|
||||
};
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -64,7 +64,7 @@ class BaseSemaphore {
|
||||
};
|
||||
|
||||
/// A semaphore for sending signals from the host to the device.
|
||||
class Host2DeviceSemaphore : public BaseSemaphore<detail::GpuDeleter, std::default_delete> {
|
||||
class Host2DeviceSemaphore : public BaseSemaphore<detail::GpuDeleter, detail::GpuHostDeleter> {
|
||||
private:
|
||||
std::shared_ptr<Connection> connection_;
|
||||
|
||||
|
||||
@@ -19,16 +19,33 @@ struct Host2DeviceSemaphoreDeviceHandle {
|
||||
/// Poll if the host has signaled.
|
||||
/// @return true if the host has signaled.
|
||||
MSCCLPP_DEVICE_INLINE bool poll() {
|
||||
bool signaled = (atomicLoad(inboundSemaphoreId, memoryOrderAcquire) > (*expectedInboundSemaphoreId));
|
||||
if (signaled) (*expectedInboundSemaphoreId) += 1;
|
||||
bool signaled = (loadInbound() > loadExpectedInbound());
|
||||
if (signaled) incExpectedInbound();
|
||||
return signaled;
|
||||
}
|
||||
|
||||
/// Wait for the host to signal.
|
||||
MSCCLPP_DEVICE_INLINE void wait([[maybe_unused]] int64_t maxSpinCount = 100000000) {
|
||||
(*expectedInboundSemaphoreId) += 1;
|
||||
uint64_t flag = (*expectedInboundSemaphoreId);
|
||||
POLL_MAYBE_JAILBREAK((atomicLoad(inboundSemaphoreId, memoryOrderAcquire) < flag), maxSpinCount);
|
||||
auto expected = incExpectedInbound();
|
||||
POLL_MAYBE_JAILBREAK((loadInbound() < expected), maxSpinCount);
|
||||
}
|
||||
|
||||
/// Thread-safe read of expected inbound value.
|
||||
/// @return The expected inbound value.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t loadExpectedInbound() {
|
||||
return atomicLoad<uint64_t, scopeDevice>(expectedInboundSemaphoreId, memoryOrderRelaxed);
|
||||
}
|
||||
|
||||
/// Thread-safe increment of expected inbound value.
|
||||
/// @return The incremented expected inbound value.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t incExpectedInbound() {
|
||||
return atomicFetchAdd<uint64_t, scopeDevice>(expectedInboundSemaphoreId, 1, memoryOrderRelaxed) + 1;
|
||||
}
|
||||
|
||||
/// Thread-safe read of inbound value.
|
||||
/// @return The inbound value.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t loadInbound() {
|
||||
return atomicLoad<uint64_t, scopeSystem>(inboundSemaphoreId, memoryOrderAcquire);
|
||||
}
|
||||
#endif // defined(MSCCLPP_DEVICE_COMPILE)
|
||||
|
||||
@@ -43,67 +60,72 @@ struct Host2DeviceSemaphoreDeviceHandle {
|
||||
/// Device-side handle for MemoryDevice2DeviceSemaphore.
|
||||
struct MemoryDevice2DeviceSemaphoreDeviceHandle {
|
||||
#if defined(MSCCLPP_DEVICE_COMPILE)
|
||||
/// Poll if the remote device has signaled.
|
||||
/// @return true if the remote device has signaled.
|
||||
/// Poll if remote device has signaled.
|
||||
/// @return true if remote device has signaled.
|
||||
MSCCLPP_DEVICE_INLINE bool poll() {
|
||||
bool signaled = (atomicLoad(inboundSemaphoreId, memoryOrderAcquire) > (*expectedInboundSemaphoreId));
|
||||
if (signaled) (*expectedInboundSemaphoreId) += 1;
|
||||
bool signaled = (loadInbound() > loadExpectedInbound());
|
||||
if (signaled) incExpectedInbound();
|
||||
return signaled;
|
||||
}
|
||||
|
||||
/// Wait for the remote device to signal.
|
||||
/// Wait for remote device to signal.
|
||||
MSCCLPP_DEVICE_INLINE void wait([[maybe_unused]] int64_t maxSpinCount = 100000000) {
|
||||
(*expectedInboundSemaphoreId) += 1;
|
||||
uint64_t flag = (*expectedInboundSemaphoreId);
|
||||
POLL_MAYBE_JAILBREAK((atomicLoad(inboundSemaphoreId, memoryOrderAcquire) < flag), maxSpinCount);
|
||||
auto expected = incExpectedInbound();
|
||||
POLL_MAYBE_JAILBREAK((loadInbound() < expected), maxSpinCount);
|
||||
}
|
||||
|
||||
/// Wait for the remote device to signal.
|
||||
///
|
||||
/// This function is a relaxed version of Wait() and provides no guarantee on the completion of memory operations.
|
||||
/// User requires to call proper fencing before using this function.
|
||||
///
|
||||
/// Relaxed wait; no memory completion guarantee. Use it only for synchronizing execution, not data.
|
||||
MSCCLPP_DEVICE_INLINE void relaxedWait([[maybe_unused]] int64_t maxSpinCount = 100000000) {
|
||||
(*expectedInboundSemaphoreId) += 1;
|
||||
uint64_t flag = (*expectedInboundSemaphoreId);
|
||||
POLL_MAYBE_JAILBREAK((atomicLoad(inboundSemaphoreId, memoryOrderRelaxed) < flag), maxSpinCount);
|
||||
auto expected = incExpectedInbound();
|
||||
POLL_MAYBE_JAILBREAK((loadInbound() < expected), maxSpinCount);
|
||||
}
|
||||
|
||||
/// Signal the remote device.
|
||||
///
|
||||
/// This function guarantees that all the memory operation before this function is completed before the remote
|
||||
/// semaphore is signaled.
|
||||
///
|
||||
/// Signal remote device, ensures prior memory ops complete.
|
||||
MSCCLPP_DEVICE_INLINE void signal() {
|
||||
// This fence ensures that preceding writes are visible on the peer GPU before the incremented
|
||||
// `outboundSemaphoreId` is visible.
|
||||
semaphoreIncrement();
|
||||
// use memoryOrderSeqCst instead of memoryOrderRelease since memoryOrderSeqCst
|
||||
// is more efficient on A100.
|
||||
#if __CUDA_ARCH__ == 800
|
||||
atomicStore(remoteInboundSemaphoreId, semaphoreGetLocal(), memoryOrderSeqCst);
|
||||
auto outbound = incOutbound();
|
||||
#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ == 800)
|
||||
// Using memoryOrderSeqCst is faster for A100.
|
||||
atomicStore(remoteInboundSemaphoreId, outbound, memoryOrderSeqCst);
|
||||
#else
|
||||
atomicStore(remoteInboundSemaphoreId, semaphoreGetLocal(), memoryOrderRelease);
|
||||
atomicStore(remoteInboundSemaphoreId, outbound, memoryOrderRelease);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Signal the remote device.
|
||||
///
|
||||
/// This function is a relaxed version of signal() and provides no guarantee on the completion of memory operations.
|
||||
/// User requires to call proper fencing before using this function.
|
||||
///
|
||||
/// Relaxed signal; no memory completion guarantee. Use it only for synchronizing execution, not data.
|
||||
MSCCLPP_DEVICE_INLINE void relaxedSignal() {
|
||||
// This fence ensures that preceding writes are visible on the peer GPU before the incremented
|
||||
// `outboundSemaphoreId` is visible.
|
||||
semaphoreIncrement();
|
||||
atomicStore(remoteInboundSemaphoreId, semaphoreGetLocal(), memoryOrderRelaxed);
|
||||
auto outbound = incOutbound();
|
||||
atomicStore(remoteInboundSemaphoreId, outbound, memoryOrderRelaxed);
|
||||
}
|
||||
|
||||
/// Increase the counter of the local semaphore.
|
||||
MSCCLPP_DEVICE_INLINE void semaphoreIncrement() { *outboundSemaphoreId += 1; }
|
||||
/// Thread-safe read of expected inbound value.
|
||||
/// @return The expected inbound value.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t loadExpectedInbound() {
|
||||
return atomicLoad<uint64_t, scopeDevice>(expectedInboundSemaphoreId, memoryOrderRelaxed);
|
||||
}
|
||||
|
||||
/// Get the value of the local semaphore.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t semaphoreGetLocal() const { return *outboundSemaphoreId; }
|
||||
/// Thread-safe increment of expected inbound value.
|
||||
/// @return The incremented expected inbound value.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t incExpectedInbound() {
|
||||
return atomicFetchAdd<uint64_t, scopeDevice>(expectedInboundSemaphoreId, 1, memoryOrderRelaxed) + 1;
|
||||
}
|
||||
|
||||
/// Thread-safe read of inbound value.
|
||||
/// @return The inbound value.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t loadInbound() {
|
||||
return atomicLoad<uint64_t, scopeSystem>(inboundSemaphoreId, memoryOrderAcquire);
|
||||
}
|
||||
|
||||
/// Thread-safe read of outbound value.
|
||||
/// @return The outbound value.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t loadOutbound() {
|
||||
return atomicLoad<uint64_t, scopeDevice>(outboundSemaphoreId, memoryOrderRelaxed);
|
||||
}
|
||||
|
||||
/// Thread-safe increment of outbound value.
|
||||
/// @return The incremented outbound value.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t incOutbound() {
|
||||
return atomicFetchAdd<uint64_t, scopeDevice>(outboundSemaphoreId, 1, memoryOrderRelaxed) + 1;
|
||||
}
|
||||
#endif // defined(MSCCLPP_DEVICE_COMPILE)
|
||||
|
||||
/// A local memory space where the remote device will write its semaphore value and the local device will read it.
|
||||
|
||||
@@ -148,7 +148,7 @@ void register_core(nb::module_& m) {
|
||||
.def_rw("ib_max_wr_per_send", &EndpointConfig::ibMaxWrPerSend);
|
||||
|
||||
nb::class_<Context>(m, "Context")
|
||||
.def(nb::init<>())
|
||||
.def_static("create", &Context::create)
|
||||
.def(
|
||||
"register_memory",
|
||||
[](Communicator* self, uintptr_t ptr, size_t size, TransportFlags transports) {
|
||||
|
||||
@@ -16,7 +16,7 @@ void register_port_channel(nb::module_& m) {
|
||||
.def("stop_proxy", &BaseProxyService::stopProxy);
|
||||
|
||||
nb::class_<ProxyService, BaseProxyService>(m, "ProxyService")
|
||||
.def(nb::init<size_t>(), nb::arg("fifoSize") = DEFAULT_FIFO_SIZE)
|
||||
.def(nb::init<int>(), nb::arg("fifoSize") = DEFAULT_FIFO_SIZE)
|
||||
.def("start_proxy", &ProxyService::startProxy)
|
||||
.def("stop_proxy", &ProxyService::stopProxy)
|
||||
.def("build_and_add_semaphore", &ProxyService::buildAndAddSemaphore, nb::arg("comm"), nb::arg("connection"))
|
||||
|
||||
@@ -36,18 +36,12 @@ class MyProxyService {
|
||||
connections_(conns),
|
||||
allRegMem_(allRegMem),
|
||||
semaphores_(semaphores),
|
||||
proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) {
|
||||
proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }) {
|
||||
int cudaDevice;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
|
||||
deviceNumaNode_ = mscclpp::getDeviceNumaNode(cudaDevice);
|
||||
}
|
||||
|
||||
void bindThread() {
|
||||
if (deviceNumaNode_ >= 0) {
|
||||
mscclpp::numaBind(deviceNumaNode_);
|
||||
}
|
||||
}
|
||||
|
||||
mscclpp::ProxyHandlerResult handleTrigger(mscclpp::ProxyTrigger) {
|
||||
int dataSizePerRank = dataSize_ / nranks_;
|
||||
for (int r = 1; r < nranks_; ++r) {
|
||||
@@ -64,7 +58,7 @@ class MyProxyService {
|
||||
|
||||
void stop() { proxy_.stop(); }
|
||||
|
||||
mscclpp::FifoDeviceHandle fifoDeviceHandle() { return proxy_.fifo().deviceHandle(); }
|
||||
mscclpp::FifoDeviceHandle fifoDeviceHandle() { return proxy_.fifo()->deviceHandle(); }
|
||||
};
|
||||
|
||||
void init_mscclpp_proxy_test_module(nb::module_ &m) {
|
||||
|
||||
@@ -11,7 +11,7 @@ namespace mscclpp {
|
||||
Communicator::Impl::Impl(std::shared_ptr<Bootstrap> bootstrap, std::shared_ptr<Context> context)
|
||||
: bootstrap_(bootstrap) {
|
||||
if (!context) {
|
||||
context_ = std::make_shared<Context>();
|
||||
context_ = Context::create();
|
||||
} else {
|
||||
context_ = context;
|
||||
}
|
||||
|
||||
@@ -37,13 +37,13 @@ std::string Connection::getTransportName() const {
|
||||
TransportNames[static_cast<int>(this->remoteTransport())];
|
||||
}
|
||||
|
||||
int Connection::getMaxWriteQueueSize() const { return maxWriteQueueSize; }
|
||||
int Connection::getMaxWriteQueueSize() const { return maxWriteQueueSize_; }
|
||||
|
||||
// CudaIpcConnection
|
||||
|
||||
CudaIpcConnection::CudaIpcConnection(Endpoint localEndpoint, Endpoint remoteEndpoint,
|
||||
std::shared_ptr<CudaStreamWithFlags> stream)
|
||||
: Connection(localEndpoint.maxWriteQueueSize()), stream_(stream) {
|
||||
CudaIpcConnection::CudaIpcConnection(std::shared_ptr<Context> context, Endpoint localEndpoint, Endpoint remoteEndpoint,
|
||||
std::shared_ptr<CudaIpcStream> stream)
|
||||
: Connection(context, localEndpoint.maxWriteQueueSize()), stream_(stream) {
|
||||
if (localEndpoint.transport() != Transport::CudaIpc) {
|
||||
throw mscclpp::Error("Cuda IPC connection can only be made from a Cuda IPC endpoint", ErrorCode::InvalidUsage);
|
||||
}
|
||||
@@ -76,9 +76,8 @@ void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, Register
|
||||
char* dstPtr = (char*)dst.data();
|
||||
char* srcPtr = (char*)src.data();
|
||||
|
||||
if (!env()->cudaIpcUseDefaultStream && stream_->empty()) stream_->set(cudaStreamNonBlocking);
|
||||
stream_->memcpyD2D(dstPtr + dstOffset, srcPtr + srcOffset, size);
|
||||
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, *stream_));
|
||||
INFO(MSCCLPP_P2P, "CudaIpcConnection write: from %p to %p, size %lu", srcPtr + srcOffset, dstPtr + dstOffset, size);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_CONN_CUDA_IPC_WRITE_EXIT)
|
||||
@@ -96,9 +95,8 @@ void CudaIpcConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset,
|
||||
*src = newValue;
|
||||
uint64_t* dstPtr = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(dst.data()) + dstOffset);
|
||||
|
||||
if (!env()->cudaIpcUseDefaultStream && stream_->empty()) stream_->set(cudaStreamNonBlocking);
|
||||
stream_->memcpyH2D(dstPtr + dstOffset, src, sizeof(uint64_t));
|
||||
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dstPtr, src, sizeof(uint64_t), cudaMemcpyHostToDevice, *stream_));
|
||||
INFO(MSCCLPP_P2P, "CudaIpcConnection atomic write: from %p to %p, %lu -> %lu", src, dstPtr + dstOffset, oldValue,
|
||||
newValue);
|
||||
|
||||
@@ -116,10 +114,8 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) {
|
||||
INFO(MSCCLPP_P2P, "CudaIpcConnection flush: timeout is not supported, ignored");
|
||||
}
|
||||
|
||||
if (!env()->cudaIpcUseDefaultStream && stream_->empty()) stream_->set(cudaStreamNonBlocking);
|
||||
stream_->sync();
|
||||
|
||||
AvoidCudaGraphCaptureGuard guard;
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(*stream_));
|
||||
INFO(MSCCLPP_P2P, "CudaIpcConnection flushing connection");
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_CONN_CUDA_IPC_FLUSH_EXIT)
|
||||
@@ -129,16 +125,16 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) {
|
||||
|
||||
// IBConnection
|
||||
|
||||
IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context)
|
||||
: Connection(localEndpoint.maxWriteQueueSize() != -1 ? localEndpoint.maxWriteQueueSize()
|
||||
: EndpointConfig::DefaultMaxCqSize),
|
||||
IBConnection::IBConnection(std::shared_ptr<Context> context, Endpoint localEndpoint, Endpoint remoteEndpoint)
|
||||
: Connection(context, localEndpoint.maxWriteQueueSize() != -1 ? localEndpoint.maxWriteQueueSize()
|
||||
: EndpointConfig::DefaultMaxCqSize),
|
||||
transport_(localEndpoint.transport()),
|
||||
remoteTransport_(remoteEndpoint.transport()),
|
||||
dummyAtomicSource_(std::make_unique<uint64_t>(0)) {
|
||||
qp = getImpl(localEndpoint)->ibQp_;
|
||||
qp->rtr(getImpl(remoteEndpoint)->ibQpInfo_);
|
||||
qp->rts();
|
||||
dummyAtomicSourceMem_ = context.registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_);
|
||||
qp_ = getImpl(localEndpoint)->ibQp_;
|
||||
qp_->rtr(getImpl(remoteEndpoint)->ibQpInfo_);
|
||||
qp_->rts();
|
||||
dummyAtomicSourceMem_ = context->registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_);
|
||||
validateTransport(dummyAtomicSourceMem_, transport_);
|
||||
dstTransportInfo_ = getImpl(dummyAtomicSourceMem_)->getTransportInfo(transport_);
|
||||
INFO(MSCCLPP_NET, "IB connection via %s created", getIBDeviceName(transport_).c_str());
|
||||
@@ -169,10 +165,10 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem
|
||||
auto dstMrInfo = dstTransportInfo.ibMrInfo;
|
||||
auto srcMr = srcTransportInfo.ibMr;
|
||||
|
||||
qp->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset,
|
||||
/*signaled=*/true);
|
||||
qp_->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset,
|
||||
/*signaled=*/true);
|
||||
|
||||
qp->postSend();
|
||||
qp_->postSend();
|
||||
INFO(MSCCLPP_NET, "IBConnection write: from %p to %p, size %lu", (uint8_t*)srcMr->getBuff() + srcOffset,
|
||||
(uint8_t*)dstMrInfo.addr + dstOffset, size);
|
||||
|
||||
@@ -197,9 +193,9 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
|
||||
uint64_t oldValue = *src;
|
||||
*src = newValue;
|
||||
|
||||
qp->stageAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, /*signaled=*/true);
|
||||
qp_->stageAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, /*signaled=*/true);
|
||||
|
||||
qp->postSend();
|
||||
qp_->postSend();
|
||||
INFO(MSCCLPP_NET, "IBConnection atomic Write: from %p to %p, %lu -> %lu", src, (uint8_t*)dstMrInfo.addr + dstOffset,
|
||||
oldValue, newValue);
|
||||
|
||||
@@ -214,20 +210,20 @@ void IBConnection::flush(int64_t timeoutUsec) {
|
||||
#endif
|
||||
|
||||
Timer timer;
|
||||
while (qp->getNumCqItems()) {
|
||||
int wcNum = qp->pollCq();
|
||||
while (qp_->getNumCqItems()) {
|
||||
int wcNum = qp_->pollCq();
|
||||
if (wcNum < 0) {
|
||||
throw mscclpp::IbError("pollCq failed: error no " + std::to_string(errno), errno);
|
||||
} else if (timeoutUsec >= 0) {
|
||||
auto elapsed = timer.elapsed();
|
||||
if (elapsed > timeoutUsec) {
|
||||
throw Error("pollCq timed out: waited for " + std::to_string(elapsed / 1e6) + " seconds. Expected " +
|
||||
std::to_string(qp->getNumCqItems()) + " signals",
|
||||
std::to_string(qp_->getNumCqItems()) + " signals",
|
||||
ErrorCode::Timeout);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < wcNum; ++i) {
|
||||
int status = qp->getWcStatus(i);
|
||||
int status = qp_->getWcStatus(i);
|
||||
if (status != static_cast<int>(WsStatus::Success)) {
|
||||
throw mscclpp::IbError("a work item failed: status " + std::to_string(status), status);
|
||||
}
|
||||
@@ -242,9 +238,9 @@ void IBConnection::flush(int64_t timeoutUsec) {
|
||||
|
||||
// EthernetConnection
|
||||
|
||||
EthernetConnection::EthernetConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, uint64_t sendBufferSize,
|
||||
uint64_t recvBufferSize)
|
||||
: Connection(localEndpoint.maxWriteQueueSize()),
|
||||
EthernetConnection::EthernetConnection(std::shared_ptr<Context> context, Endpoint localEndpoint,
|
||||
Endpoint remoteEndpoint, uint64_t sendBufferSize, uint64_t recvBufferSize)
|
||||
: Connection(context, localEndpoint.maxWriteQueueSize()),
|
||||
abortFlag_(0),
|
||||
sendBufferSize_(sendBufferSize),
|
||||
recvBufferSize_(recvBufferSize) {
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
|
||||
#include "context.hpp"
|
||||
|
||||
#include <mscclpp/env.hpp>
|
||||
|
||||
#include "api.h"
|
||||
#include "connection.hpp"
|
||||
#include "debug.h"
|
||||
@@ -11,9 +13,35 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
CudaIpcStream::CudaIpcStream() : stream_(std::make_shared<CudaStreamWithFlags>()), dirty_(false) {}
|
||||
|
||||
void CudaIpcStream::setStreamIfNeeded() {
|
||||
if (!env()->cudaIpcUseDefaultStream && stream_->empty()) stream_->set(cudaStreamNonBlocking);
|
||||
}
|
||||
|
||||
void CudaIpcStream::memcpyD2D(void *dst, const void *src, size_t nbytes) {
|
||||
setStreamIfNeeded();
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, *stream_));
|
||||
dirty_ = true;
|
||||
}
|
||||
|
||||
void CudaIpcStream::memcpyH2D(void *dst, const void *src, size_t nbytes) {
|
||||
setStreamIfNeeded();
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, *stream_));
|
||||
dirty_ = true;
|
||||
}
|
||||
|
||||
void CudaIpcStream::sync() {
|
||||
setStreamIfNeeded();
|
||||
if (dirty_) {
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(*stream_));
|
||||
dirty_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
Context::Impl::Impl() {}
|
||||
|
||||
IbCtx* Context::Impl::getIbContext(Transport ibTransport) {
|
||||
IbCtx *Context::Impl::getIbContext(Transport ibTransport) {
|
||||
// Find IB context or create it
|
||||
auto it = ibContexts_.find(ibTransport);
|
||||
if (it == ibContexts_.end()) {
|
||||
@@ -29,7 +57,7 @@ MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique<Impl>()) {}
|
||||
|
||||
MSCCLPP_API_CPP Context::~Context() = default;
|
||||
|
||||
MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void* ptr, size_t size, TransportFlags transports) {
|
||||
MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void *ptr, size_t size, TransportFlags transports) {
|
||||
return RegisteredMemory(std::make_shared<RegisteredMemory::Impl>(ptr, size, transports, *pimpl_));
|
||||
}
|
||||
|
||||
@@ -43,24 +71,25 @@ MSCCLPP_API_CPP std::shared_ptr<Connection> Context::connect(Endpoint localEndpo
|
||||
if (remoteEndpoint.transport() != Transport::CudaIpc) {
|
||||
throw mscclpp::Error("Local transport is CudaIpc but remote is not", ErrorCode::InvalidUsage);
|
||||
}
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
pimpl_->ipcStreams_.emplace_back(std::make_shared<CudaStreamWithFlags>());
|
||||
#else
|
||||
#if defined(MSCCLPP_DEVICE_HIP)
|
||||
pimpl_->ipcStreams_.emplace_back(std::make_shared<CudaIpcStream>());
|
||||
#else // !defined(MSCCLPP_DEVICE_HIP)
|
||||
if (pimpl_->ipcStreams_.empty()) {
|
||||
pimpl_->ipcStreams_.emplace_back(std::make_shared<CudaStreamWithFlags>());
|
||||
pimpl_->ipcStreams_.emplace_back(std::make_shared<CudaIpcStream>());
|
||||
}
|
||||
#endif
|
||||
conn = std::make_shared<CudaIpcConnection>(localEndpoint, remoteEndpoint, pimpl_->ipcStreams_.back());
|
||||
#endif // !defined(MSCCLPP_DEVICE_HIP)
|
||||
conn = std::make_shared<CudaIpcConnection>(shared_from_this(), localEndpoint, remoteEndpoint,
|
||||
pimpl_->ipcStreams_.back());
|
||||
} else if (AllIBTransports.has(localEndpoint.transport())) {
|
||||
if (!AllIBTransports.has(remoteEndpoint.transport())) {
|
||||
throw mscclpp::Error("Local transport is IB but remote is not", ErrorCode::InvalidUsage);
|
||||
}
|
||||
conn = std::make_shared<IBConnection>(localEndpoint, remoteEndpoint, *this);
|
||||
conn = std::make_shared<IBConnection>(shared_from_this(), localEndpoint, remoteEndpoint);
|
||||
} else if (localEndpoint.transport() == Transport::Ethernet) {
|
||||
if (remoteEndpoint.transport() != Transport::Ethernet) {
|
||||
throw mscclpp::Error("Local transport is Ethernet but remote is not", ErrorCode::InvalidUsage);
|
||||
}
|
||||
conn = std::make_shared<EthernetConnection>(localEndpoint, remoteEndpoint);
|
||||
conn = std::make_shared<EthernetConnection>(shared_from_this(), localEndpoint, remoteEndpoint);
|
||||
} else {
|
||||
throw mscclpp::Error("Unsupported transport", ErrorCode::InternalError);
|
||||
}
|
||||
|
||||
@@ -67,7 +67,7 @@ Env::Env()
|
||||
enableNcclFallback(readEnv<bool>("MSCCLPP_ENABLE_NCCL_FALLBACK", false)),
|
||||
disableChannelCache(readEnv<bool>("MSCCLPP_DISABLE_CHANNEL_CACHE", false)),
|
||||
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
|
||||
fifoUseTailReplica(readEnv<bool>("MSCCLPP_FIFO_USE_TAIL_REPLICA", false)) {}
|
||||
fifoUseTailReplica(readEnv<bool>("MSCCLPP_FIFO_USE_TAIL_REPLICA", true)) {}
|
||||
|
||||
std::shared_ptr<Env> env() {
|
||||
static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
|
||||
|
||||
57
src/fifo.cc
57
src/fifo.cc
@@ -4,6 +4,7 @@
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <mscclpp/fifo.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/numa.hpp>
|
||||
|
||||
#include "api.h"
|
||||
#include "atomic.hpp"
|
||||
@@ -13,31 +14,40 @@ namespace mscclpp {
|
||||
struct Fifo::Impl {
|
||||
detail::UniqueGpuHostPtr<ProxyTrigger> triggers;
|
||||
detail::UniqueGpuPtr<uint64_t> head;
|
||||
std::shared_ptr<uint64_t> tailHost;
|
||||
detail::UniqueGpuPtr<uint64_t> tailReplica;
|
||||
const int size;
|
||||
|
||||
// The original tail of this fifo allocated on the host. If a tail replica is used
|
||||
// (when `env()->fifoUseTailReplica == true`), it always holds that *tailReplica <= *hostTail.
|
||||
std::shared_ptr<uint64_t> hostTail;
|
||||
|
||||
// for transferring fifo tail
|
||||
CudaStreamWithFlags stream;
|
||||
|
||||
Impl(int size)
|
||||
: triggers(detail::gpuCallocHostUnique<ProxyTrigger>(size)),
|
||||
head(detail::gpuCallocUnique<uint64_t>()),
|
||||
tailHost(env()->fifoUseTailReplica ? std::make_shared<uint64_t>(0) : detail::gpuCallocHostShared<uint64_t>()),
|
||||
tailReplica(env()->fifoUseTailReplica ? detail::gpuCallocUnique<uint64_t>() : nullptr),
|
||||
size(size),
|
||||
hostTail(env()->fifoUseTailReplica ? std::make_shared<uint64_t>(0) : detail::gpuCallocHostShared<uint64_t>()),
|
||||
stream(cudaStreamNonBlocking) {}
|
||||
size(size) {
|
||||
if (env()->fifoUseTailReplica) {
|
||||
stream.set(cudaStreamNonBlocking);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
MSCCLPP_API_CPP Fifo::Fifo(int size) : pimpl(std::make_unique<Impl>(size)) {}
|
||||
MSCCLPP_API_CPP Fifo::Fifo(int size) {
|
||||
int device;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&device));
|
||||
int numaNode = getDeviceNumaNode(device);
|
||||
if (numaNode >= 0) {
|
||||
numaBind(numaNode);
|
||||
}
|
||||
pimpl_ = std::make_unique<Impl>(size);
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP Fifo::~Fifo() = default;
|
||||
|
||||
MSCCLPP_API_CPP ProxyTrigger Fifo::poll() {
|
||||
ProxyTrigger trigger;
|
||||
ProxyTrigger* ptr = &pimpl->triggers.get()[*(pimpl->hostTail) % pimpl->size];
|
||||
ProxyTrigger* ptr = &pimpl_->triggers.get()[*(pimpl_->tailHost) % pimpl_->size];
|
||||
// we are loading fst first. if fst is non-zero then snd is also valid
|
||||
trigger.fst = atomicLoad(&(ptr->fst), memoryOrderAcquire);
|
||||
trigger.snd = ptr->snd;
|
||||
@@ -45,39 +55,34 @@ MSCCLPP_API_CPP ProxyTrigger Fifo::poll() {
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP void Fifo::pop() {
|
||||
uint64_t curTail = *(pimpl->hostTail);
|
||||
atomicStore(&(pimpl->triggers.get()[curTail % pimpl->size].fst), uint64_t{0}, memoryOrderRelease);
|
||||
*(pimpl->hostTail) = curTail + 1;
|
||||
uint64_t curTail = *(pimpl_->tailHost);
|
||||
pimpl_->triggers.get()[curTail % pimpl_->size].fst = 0;
|
||||
*(pimpl_->tailHost) = curTail + 1;
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP void Fifo::flushTail(bool sync) {
|
||||
MSCCLPP_API_CPP void Fifo::flushTail([[maybe_unused]] bool sync) {
|
||||
if (!env()->fifoUseTailReplica) {
|
||||
// Nothing to flush if the tail is not replicated.
|
||||
return;
|
||||
}
|
||||
#if defined(MSCCLPP_DEVICE_HIP)
|
||||
*(pimpl->tailReplica.get()) = *(pimpl->hostTail.get());
|
||||
#else // !defined(MSCCLPP_DEVICE_HIP)
|
||||
// Flush the tail to device memory. This is either triggered every ProxyFlushPeriod to make sure that the fifo can
|
||||
// make progress even if there is no request mscclppSync. However, mscclppSync type is for flush request.
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(pimpl->tailReplica.get(), pimpl->hostTail.get(), sizeof(uint64_t),
|
||||
cudaMemcpyHostToDevice, pimpl->stream));
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(pimpl_->tailReplica.get(), pimpl_->tailHost.get(), sizeof(uint64_t),
|
||||
cudaMemcpyHostToDevice, pimpl_->stream));
|
||||
if (sync) {
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(pimpl->stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(pimpl_->stream));
|
||||
}
|
||||
#endif // !defined(MSCCLPP_DEVICE_HIP)
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP int Fifo::size() const { return pimpl->size; }
|
||||
MSCCLPP_API_CPP int Fifo::size() const { return pimpl_->size; }
|
||||
|
||||
MSCCLPP_API_CPP FifoDeviceHandle Fifo::deviceHandle() const {
|
||||
FifoDeviceHandle deviceHandle;
|
||||
deviceHandle.triggers = pimpl->triggers.get();
|
||||
deviceHandle.head = pimpl->head.get();
|
||||
deviceHandle.triggers = pimpl_->triggers.get();
|
||||
deviceHandle.head = pimpl_->head.get();
|
||||
// tailReplica refers to the original tail if `fifoUseTailReplica == false`.
|
||||
deviceHandle.tailReplica = env()->fifoUseTailReplica ? pimpl->tailReplica.get() : pimpl->hostTail.get();
|
||||
deviceHandle.size = pimpl->size;
|
||||
deviceHandle.tailReplica = env()->fifoUseTailReplica ? pimpl_->tailReplica.get() : pimpl_->tailHost.get();
|
||||
deviceHandle.size = pimpl_->size;
|
||||
return deviceHandle;
|
||||
}
|
||||
|
||||
|
||||
@@ -83,10 +83,10 @@ void* gpuCalloc(size_t bytes) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void* gpuCallocHost(size_t bytes) {
|
||||
void* gpuCallocHost(size_t bytes, unsigned int flags) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
void* ptr;
|
||||
MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, bytes, cudaHostAllocMapped | cudaHostAllocWriteCombined));
|
||||
MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, bytes, flags));
|
||||
::memset(ptr, 0, bytes);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
@@ -16,10 +16,12 @@
|
||||
namespace mscclpp {
|
||||
|
||||
class CudaIpcConnection : public Connection {
|
||||
std::shared_ptr<CudaStreamWithFlags> stream_;
|
||||
private:
|
||||
std::shared_ptr<CudaIpcStream> stream_;
|
||||
|
||||
public:
|
||||
CudaIpcConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, std::shared_ptr<CudaStreamWithFlags> stream);
|
||||
CudaIpcConnection(std::shared_ptr<Context> context, Endpoint localEndpoint, Endpoint remoteEndpoint,
|
||||
std::shared_ptr<CudaIpcStream> stream);
|
||||
|
||||
Transport transport() const override;
|
||||
|
||||
@@ -33,15 +35,16 @@ class CudaIpcConnection : public Connection {
|
||||
};
|
||||
|
||||
class IBConnection : public Connection {
|
||||
private:
|
||||
Transport transport_;
|
||||
Transport remoteTransport_;
|
||||
IbQp* qp;
|
||||
IbQp* qp_;
|
||||
std::unique_ptr<uint64_t> dummyAtomicSource_; // not used anywhere but IB needs a source
|
||||
RegisteredMemory dummyAtomicSourceMem_;
|
||||
mscclpp::TransportInfo dstTransportInfo_;
|
||||
|
||||
public:
|
||||
IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context);
|
||||
IBConnection(std::shared_ptr<Context> context, Endpoint localEndpoint, Endpoint remoteEndpoint);
|
||||
|
||||
Transport transport() const override;
|
||||
|
||||
@@ -55,6 +58,7 @@ class IBConnection : public Connection {
|
||||
};
|
||||
|
||||
class EthernetConnection : public Connection {
|
||||
private:
|
||||
std::unique_ptr<Socket> sendSocket_;
|
||||
std::unique_ptr<Socket> recvSocket_;
|
||||
std::thread threadRecvMessages_;
|
||||
@@ -64,9 +68,12 @@ class EthernetConnection : public Connection {
|
||||
std::vector<char> sendBuffer_;
|
||||
std::vector<char> recvBuffer_;
|
||||
|
||||
void recvMessages();
|
||||
void sendMessage();
|
||||
|
||||
public:
|
||||
EthernetConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, uint64_t sendBufferSize = 256 * 1024 * 1024,
|
||||
uint64_t recvBufferSize = 256 * 1024 * 1024);
|
||||
EthernetConnection(std::shared_ptr<Context> context, Endpoint localEndpoint, Endpoint remoteEndpoint,
|
||||
uint64_t sendBufferSize = 256 * 1024 * 1024, uint64_t recvBufferSize = 256 * 1024 * 1024);
|
||||
|
||||
~EthernetConnection();
|
||||
|
||||
@@ -79,11 +86,6 @@ class EthernetConnection : public Connection {
|
||||
void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override;
|
||||
|
||||
void flush(int64_t timeoutUsec) override;
|
||||
|
||||
private:
|
||||
void recvMessages();
|
||||
|
||||
void sendMessage();
|
||||
};
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -13,15 +13,34 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
class CudaIpcStream {
|
||||
private:
|
||||
std::shared_ptr<CudaStreamWithFlags> stream_;
|
||||
bool dirty_;
|
||||
|
||||
void setStreamIfNeeded();
|
||||
|
||||
public:
|
||||
CudaIpcStream();
|
||||
|
||||
void memcpyD2D(void *dst, const void *src, size_t nbytes);
|
||||
|
||||
void memcpyH2D(void *dst, const void *src, size_t nbytes);
|
||||
|
||||
void sync();
|
||||
|
||||
operator cudaStream_t() const { return *stream_; }
|
||||
};
|
||||
|
||||
struct Context::Impl {
|
||||
std::vector<std::shared_ptr<Connection>> connections_;
|
||||
std::unordered_map<Transport, std::unique_ptr<IbCtx>> ibContexts_;
|
||||
std::vector<std::shared_ptr<CudaStreamWithFlags>> ipcStreams_;
|
||||
std::vector<std::shared_ptr<CudaIpcStream>> ipcStreams_;
|
||||
CUmemGenericAllocationHandle mcHandle_;
|
||||
|
||||
Impl();
|
||||
|
||||
IbCtx* getIbContext(Transport ibTransport);
|
||||
IbCtx *getIbContext(Transport ibTransport);
|
||||
};
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -18,12 +18,19 @@ MSCCLPP_API_CPP PortChannel::PortChannel(SemaphoreId semaphoreId, std::shared_pt
|
||||
std::shared_ptr<Proxy> proxy, MemoryId dst, MemoryId src)
|
||||
: BasePortChannel(semaphoreId, semaphore, proxy), dst_(dst), src_(src) {}
|
||||
|
||||
MSCCLPP_API_CPP ProxyService::ProxyService(size_t fifoSize)
|
||||
: proxy_(std::make_shared<Proxy>([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); },
|
||||
[&]() { bindThread(); }, fifoSize)) {
|
||||
MSCCLPP_API_CPP ProxyService::ProxyService(int fifoSize) {
|
||||
int cudaDevice;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
|
||||
deviceNumaNode = getDeviceNumaNode(cudaDevice);
|
||||
int deviceNumaNode = getDeviceNumaNode(cudaDevice);
|
||||
auto initFunc = [cudaDevice, deviceNumaNode]() {
|
||||
MSCCLPP_CUDATHROW(cudaSetDevice(cudaDevice));
|
||||
if (deviceNumaNode >= 0) {
|
||||
numaBind(deviceNumaNode);
|
||||
INFO(MSCCLPP_INIT, "NUMA node of ProxyService proxy thread is set to %d", deviceNumaNode);
|
||||
}
|
||||
};
|
||||
auto handlerFunc = [&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); };
|
||||
proxy_ = std::make_shared<Proxy>(handlerFunc, initFunc, fifoSize);
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP SemaphoreId ProxyService::buildAndAddSemaphore(Communicator& communicator,
|
||||
@@ -58,49 +65,44 @@ MSCCLPP_API_CPP void ProxyService::startProxy() { proxy_->start(); }
|
||||
|
||||
MSCCLPP_API_CPP void ProxyService::stopProxy() { proxy_->stop(); }
|
||||
|
||||
MSCCLPP_API_CPP void ProxyService::bindThread() {
|
||||
if (deviceNumaNode >= 0) {
|
||||
numaBind(deviceNumaNode);
|
||||
INFO(MSCCLPP_INIT, "NUMA node of ProxyService proxy thread is set to %d", deviceNumaNode);
|
||||
}
|
||||
}
|
||||
|
||||
ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger triggerRaw) {
|
||||
ChannelTrigger* trigger = reinterpret_cast<ChannelTrigger*>(&triggerRaw);
|
||||
std::shared_ptr<Host2DeviceSemaphore> semaphore = semaphores_[trigger->fields.semaphoreId];
|
||||
|
||||
auto result = ProxyHandlerResult::Continue;
|
||||
int maxWriteQueueSize = semaphore->connection()->getMaxWriteQueueSize();
|
||||
auto& numRequests = inflightRequests_[semaphore->connection()];
|
||||
|
||||
if (trigger->fields.type & TriggerData) {
|
||||
RegisteredMemory& dst = memories_[trigger->fields.dstMemoryId];
|
||||
RegisteredMemory& src = memories_[trigger->fields.srcMemoryId];
|
||||
semaphore->connection()->write(dst, trigger->fields.dstOffset, src, trigger->fields.srcOffset,
|
||||
trigger->fields.size);
|
||||
inflightRequests[semaphore->connection()]++;
|
||||
numRequests++;
|
||||
}
|
||||
|
||||
if (trigger->fields.type & TriggerFlag) {
|
||||
semaphore->signal();
|
||||
inflightRequests[semaphore->connection()]++;
|
||||
numRequests++;
|
||||
}
|
||||
|
||||
if (trigger->fields.type & TriggerSync ||
|
||||
(maxWriteQueueSize != -1 && inflightRequests[semaphore->connection()] > maxWriteQueueSize)) {
|
||||
if (((trigger->fields.type & TriggerSync) && numRequests > 0) ||
|
||||
(maxWriteQueueSize != -1 && numRequests > maxWriteQueueSize)) {
|
||||
semaphore->connection()->flush();
|
||||
result = ProxyHandlerResult::FlushFifoTailAndContinue;
|
||||
inflightRequests[semaphore->connection()] = 0;
|
||||
numRequests = 0;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP BasePortChannel::DeviceHandle BasePortChannel::deviceHandle() const {
|
||||
return BasePortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo().deviceHandle());
|
||||
return BasePortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle());
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP PortChannel::DeviceHandle PortChannel::deviceHandle() const {
|
||||
return PortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo().deviceHandle(), dst_, src_);
|
||||
return PortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle(), dst_,
|
||||
src_);
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
78
src/proxy.cc
78
src/proxy.cc
@@ -4,6 +4,7 @@
|
||||
#include <atomic>
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/numa.hpp>
|
||||
#include <mscclpp/proxy.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
#include <thread>
|
||||
@@ -12,53 +13,61 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
const int ProxyStopCheckPeriod = 1000;
|
||||
constexpr int ProxyStopCheckPeriod = 1000;
|
||||
|
||||
// Unless explicitly requested, a flush of the tail to device memory is triggered for every ProxyFlushPeriod.
|
||||
// As long as the FIFO size is large enough, having a stale tail is not a problem.
|
||||
const int ProxyFlushPeriod = 4;
|
||||
constexpr int ProxyFlushPeriod = 4;
|
||||
|
||||
struct Proxy::Impl {
|
||||
ProxyHandler handler;
|
||||
std::function<void()> threadInit;
|
||||
Fifo fifo;
|
||||
std::shared_ptr<Fifo> fifo;
|
||||
std::thread service;
|
||||
std::atomic_bool running;
|
||||
|
||||
Impl(ProxyHandler handler, std::function<void()> threadInit, size_t fifoSize)
|
||||
: handler(handler), threadInit(threadInit), fifo(fifoSize), running(false) {}
|
||||
Impl(ProxyHandler handler, std::function<void()> threadInit, int fifoSize)
|
||||
: handler(handler), threadInit(threadInit), fifo(std::make_shared<Fifo>(fifoSize)), running(false) {}
|
||||
};
|
||||
|
||||
MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, std::function<void()> threadInit, size_t fifoSize) {
|
||||
pimpl = std::make_unique<Impl>(handler, threadInit, fifoSize);
|
||||
MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, std::function<void()> threadInit, int fifoSize) {
|
||||
pimpl_ = std::make_unique<Impl>(handler, threadInit, fifoSize);
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, size_t fifoSize)
|
||||
: Proxy(
|
||||
handler, [] {}, fifoSize) {}
|
||||
MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, int fifoSize) {
|
||||
int cudaDevice;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
|
||||
int deviceNumaNode = getDeviceNumaNode(cudaDevice);
|
||||
auto initFunc = [cudaDevice, deviceNumaNode]() {
|
||||
MSCCLPP_CUDATHROW(cudaSetDevice(cudaDevice));
|
||||
if (deviceNumaNode >= 0) {
|
||||
numaBind(deviceNumaNode);
|
||||
}
|
||||
};
|
||||
pimpl_ = std::make_unique<Impl>(handler, initFunc, fifoSize);
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP Proxy::~Proxy() {
|
||||
if (pimpl) {
|
||||
if (pimpl_) {
|
||||
stop();
|
||||
}
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP void Proxy::start() {
|
||||
int cudaDevice;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
|
||||
pimpl_->running = true;
|
||||
pimpl_->service = std::thread([this] {
|
||||
// never capture in a proxy thread
|
||||
auto mode = cudaStreamCaptureModeRelaxed;
|
||||
MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
|
||||
pimpl->running = true;
|
||||
pimpl->service = std::thread([this, cudaDevice] {
|
||||
MSCCLPP_CUDATHROW(cudaSetDevice(cudaDevice));
|
||||
pimpl_->threadInit();
|
||||
|
||||
pimpl->threadInit();
|
||||
|
||||
ProxyHandler handler = this->pimpl->handler;
|
||||
Fifo& fifo = this->pimpl->fifo;
|
||||
std::atomic_bool& running = this->pimpl->running;
|
||||
ProxyHandler handler = this->pimpl_->handler;
|
||||
auto fifo = this->pimpl_->fifo;
|
||||
std::atomic_bool& running = this->pimpl_->running;
|
||||
ProxyTrigger trigger;
|
||||
|
||||
int flushPeriod = std::min(fifo.size(), ProxyFlushPeriod);
|
||||
int flushPeriod = std::min(fifo->size(), ProxyFlushPeriod);
|
||||
|
||||
int runCnt = ProxyStopCheckPeriod;
|
||||
uint64_t flushCnt = 0;
|
||||
@@ -70,21 +79,20 @@ MSCCLPP_API_CPP void Proxy::start() {
|
||||
}
|
||||
}
|
||||
// Poll to see if we are ready to send anything
|
||||
trigger = fifo.poll();
|
||||
trigger = fifo->poll();
|
||||
if (trigger.fst == 0 || trigger.snd == 0) { // TODO: this check is a potential pitfall for custom triggers
|
||||
continue; // there is one in progress
|
||||
}
|
||||
trigger.snd ^= ((uint64_t)1 << (uint64_t)63); // this is where the last bit of snd is reverted.
|
||||
trigger.snd ^= (uint64_t{1} << uint64_t{63}); // this is where the last bit of snd is reverted.
|
||||
|
||||
ProxyHandlerResult result = handler(trigger);
|
||||
|
||||
// Send completion: reset only the high 64 bits
|
||||
fifo.pop();
|
||||
fifo->pop();
|
||||
// Flush the tail to device memory. This is either triggered every flushPeriod to make sure that the fifo can make
|
||||
// progress even if there is no request mscclppSync. However, mscclppSync type is for flush request.
|
||||
if ((++flushCnt % flushPeriod) == 0 || result == ProxyHandlerResult::FlushFifoTailAndContinue) {
|
||||
// TODO: relocate this check: || (trigger.fields.type & mscclppSync)
|
||||
fifo.flushTail();
|
||||
fifo->flushTail();
|
||||
}
|
||||
|
||||
if (result == ProxyHandlerResult::Stop) {
|
||||
@@ -93,23 +101,17 @@ MSCCLPP_API_CPP void Proxy::start() {
|
||||
}
|
||||
|
||||
// make sure the tail is flushed before we shut the proxy
|
||||
fifo.flushTail(/*sync=*/true);
|
||||
// TODO: do these need to run?
|
||||
// bool isP2pProxy = (proxyState->ibContext == nullptr);
|
||||
// if (isP2pProxy) {
|
||||
// cudaStream_t p2pStream = proxyState->p2pStream;
|
||||
// PROXYCUDACHECK(cudaStreamSynchronize(p2pStream));
|
||||
// }
|
||||
fifo->flushTail(/*sync=*/true);
|
||||
});
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP void Proxy::stop() {
|
||||
pimpl->running = false;
|
||||
if (pimpl->service.joinable()) {
|
||||
pimpl->service.join();
|
||||
pimpl_->running = false;
|
||||
if (pimpl_->service.joinable()) {
|
||||
pimpl_->service.join();
|
||||
}
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP Fifo& Proxy::fifo() { return pimpl->fifo; }
|
||||
MSCCLPP_API_CPP std::shared_ptr<Fifo> Proxy::fifo() { return pimpl_->fifo; }
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -29,7 +29,7 @@ static detail::UniqueGpuPtr<uint64_t> createGpuSemaphoreId() {
|
||||
|
||||
MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator,
|
||||
std::shared_ptr<Connection> connection)
|
||||
: BaseSemaphore(createGpuSemaphoreId(), createGpuSemaphoreId(), std::make_unique<uint64_t>()),
|
||||
: BaseSemaphore(createGpuSemaphoreId(), createGpuSemaphoreId(), detail::gpuCallocHostUnique<uint64_t>()),
|
||||
connection_(connection) {
|
||||
INFO(MSCCLPP_INIT, "Creating a Host2Device semaphore for %s transport from %d to %d",
|
||||
connection->getTransportName().c_str(), communicator.bootstrap()->getRank(),
|
||||
|
||||
@@ -88,18 +88,13 @@ class MyProxyService {
|
||||
std::vector<std::shared_ptr<mscclpp::Host2DeviceSemaphore>> deviceSemaphores2_;
|
||||
std::vector<std::shared_ptr<mscclpp::Connection>> connections_;
|
||||
mscclpp::Proxy proxy_;
|
||||
int deviceNumaNode_;
|
||||
|
||||
public:
|
||||
MyProxyService(mscclpp::Communicator& comm, int* data_d, int dataSize)
|
||||
: dataSize_(dataSize),
|
||||
remoteMemories_(world_size),
|
||||
connections_(world_size),
|
||||
proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) {
|
||||
int cudaDevice;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
|
||||
deviceNumaNode_ = mscclpp::getDeviceNumaNode(cudaDevice);
|
||||
|
||||
proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }) {
|
||||
int thisNode = rankToNode(rank);
|
||||
int cudaNum = rankToLocalRank(rank);
|
||||
std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum);
|
||||
@@ -144,12 +139,6 @@ class MyProxyService {
|
||||
}
|
||||
}
|
||||
|
||||
void bindThread() {
|
||||
if (deviceNumaNode_ >= 0) {
|
||||
mscclpp::numaBind(deviceNumaNode_);
|
||||
}
|
||||
}
|
||||
|
||||
mscclpp::ProxyHandlerResult handleTrigger(mscclpp::ProxyTrigger triggerRaw) {
|
||||
static int flusher = 0;
|
||||
if (triggerRaw.fst > 0) {
|
||||
@@ -176,7 +165,7 @@ class MyProxyService {
|
||||
|
||||
void stop() { proxy_.stop(); }
|
||||
|
||||
mscclpp::Fifo& fifo() { return proxy_.fifo(); }
|
||||
std::shared_ptr<mscclpp::Fifo> fifo() { return proxy_.fifo(); }
|
||||
|
||||
mscclpp::Host2DeviceSemaphore::DeviceHandle getDeviceHandle1(int r) { return deviceSemaphores1_[r]->deviceHandle(); }
|
||||
|
||||
@@ -249,7 +238,7 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
if (rank == 0) printf("Launching MSCCL++ proxy threads\n");
|
||||
proxyService.start();
|
||||
mscclpp::FifoDeviceHandle fifo = proxyService.fifo().deviceHandle();
|
||||
mscclpp::FifoDeviceHandle fifo = proxyService.fifo()->deviceHandle();
|
||||
if (rank == 0) printf("Testing the correctness of AllGather implementation\n");
|
||||
cudaStream_t stream;
|
||||
MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
|
||||
@@ -536,16 +536,11 @@ class AllGatherProxyService : public mscclpp::BaseProxyService {
|
||||
};
|
||||
|
||||
AllGatherProxyService::AllGatherProxyService(int worldSize, int rank, int cudaDevice)
|
||||
: worldSize_(worldSize),
|
||||
rank_(rank),
|
||||
cudaDevice_(cudaDevice),
|
||||
sendBytes_(0),
|
||||
proxy_(
|
||||
std::make_shared<mscclpp::Proxy>([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); },
|
||||
[&]() {
|
||||
int deviceNumaNode = getDeviceNumaNode(cudaDevice_);
|
||||
numaBind(deviceNumaNode);
|
||||
})) {}
|
||||
: worldSize_(worldSize), rank_(rank), cudaDevice_(cudaDevice), sendBytes_(0) {
|
||||
MSCCLPP_CUDATHROW(cudaSetDevice(cudaDevice));
|
||||
auto handlerFunc = [&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); };
|
||||
proxy_ = std::make_shared<mscclpp::Proxy>(handlerFunc);
|
||||
}
|
||||
|
||||
mscclpp::ProxyHandlerResult AllGatherProxyService::handleTrigger(mscclpp::ProxyTrigger triggerRaw) {
|
||||
size_t offset = rank_ * sendBytes_;
|
||||
|
||||
@@ -275,6 +275,7 @@ void BaseTestEngine::runTest() {
|
||||
if (args_.reportErrors) {
|
||||
this->coll_->setupCollTest(args_, size);
|
||||
this->coll_->initData(this->args_, this->getSendBuff(), this->getExpectedBuff());
|
||||
CUDATHROW(cudaDeviceSynchronize());
|
||||
this->barrier();
|
||||
this->coll_->runColl(args_, stream_);
|
||||
CUDATHROW(cudaDeviceSynchronize());
|
||||
|
||||
Reference in New Issue
Block a user