Refactor algo selection logic and introduce symmetric_memory env (#741)

This PR refactors the algorithm selection logic in MSCCL++ and
introduces support for symmetric memory configuration through
environment variables.


1. Algorithm Selection Refactoring
Introduces a separate class for algorithm selection. This makes it possible to
add more complex selection logic based on message size, architecture, whether a
CUDA graph is enabled, and the memory allocation method.

2. Symmetric Memory Support
Introduced a symmetricMemory parameter in algorithm context key
generation. Removed the disableChannelCache environment variable, as its
meaning was ambiguous.

3. Add new args for build_default_algorithms 
Added flag_buffer and flag_buffer_size arguments to build_default_algorithms.
This lets different algorithms share a unified flag buffer, avoiding
application hangs when switching algorithms for different message sizes.

---------

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
Co-authored-by: Qinghua Zhou <qinghuazhou@microsoft.com>
Co-authored-by: Caio Rocha <caiorocha@microsoft.com>
This commit is contained in:
Binyang Li
2026-02-12 19:06:18 -08:00
committed by GitHub
parent dff3bc7bbb
commit bd68319e3e
43 changed files with 657 additions and 389 deletions

View File

@@ -343,8 +343,8 @@ public:
},
// Context key generation function
[self](const void* input, void* output,
size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateContextKey(input, output, inputSize, outputSize, dtype);
size_t inputSize, size_t outputSize, mscclpp::DataType dtype, bool symmetricMemory) {
return self->generateContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory);
}
);
}

View File

@@ -107,9 +107,10 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype,
bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize,
static_cast<ncclDataType_t>(dtype));
static_cast<ncclDataType_t>(dtype), symmetricMemory);
});
return allgatherAlgo;
}
@@ -191,7 +192,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
}
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
size_t outputSize, ncclDataType_t dtype) {
size_t outputSize, ncclDataType_t dtype, bool) {
return {(void*)input, output, inputSize, outputSize, 0};
}
};

View File

@@ -75,8 +75,9 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype);
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype,
bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory);
});
return allgatherAlgo;
}
@@ -159,7 +160,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
}
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
size_t outputSize, mscclpp::DataType dtype) {
size_t outputSize, mscclpp::DataType dtype, bool) {
return {(void*)input, output, inputSize, outputSize, 0};
}
};

View File

@@ -15,7 +15,9 @@ import ipaddress
def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection:
collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
return collection_builder.build_default_algorithms(
scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank
scratch_buffer=scratch_buffer.data_ptr(),
scratch_buffer_size=scratch_buffer.nbytes,
rank=rank,
)

View File

@@ -96,11 +96,13 @@ class Algorithm {
/// @param executor The executor for DSL algorithms (may be nullptr for native).
/// @param nBlocks Number of CUDA blocks (0 for auto-selection).
/// @param nThreadsPerBlock Number of threads per block (0 for auto-selection).
/// @param symmetricMemory Whether to use symmetric memory optimization.
/// @param extras Additional parameters for algorithm-specific customization.
/// @return The result of the operation.
virtual CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {}) = 0;
/// Reset the algorithm state, clearing any cached contexts.
@@ -201,9 +203,10 @@ class NativeAlgorithm : public Algorithm {
/// @param inputSize Size of the input buffer.
/// @param outputSize Size of the output buffer.
/// @param dtype Data type of the elements.
/// @param symmetricMemory Whether symmetric memory is enabled.
/// @return A key uniquely identifying this buffer configuration.
using ContextKeyGenFunc = std::function<AlgorithmCtxKey(const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype)>;
size_t outputSize, DataType dtype, bool symmetricMemory)>;
/// Construct a NativeAlgorithm.
/// @param name Human-readable name of the algorithm.
@@ -225,6 +228,7 @@ class NativeAlgorithm : public Algorithm {
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
const std::string& name() const override;
const std::string& collective() const override;
@@ -274,6 +278,7 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
AlgorithmType type() const override { return AlgorithmType::DSL; }
Constraint constraint() const override;
@@ -299,6 +304,7 @@ struct CollectiveRequest {
const void* inputBuffer;
void* outputBuffer;
size_t messageSize;
cudaStream_t stream;
const std::string& collective;
const DataType dtype;
const std::unordered_map<std::string, std::vector<uint64_t>>& hints;
@@ -358,6 +364,10 @@ class AlgorithmCollection {
AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
};
/// Get a default GPU flag buffer (allocated once and reused).
/// @return A pair of (shared_ptr to the flag buffer, size in bytes).
std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer();
} // namespace mscclpp
#endif // MSCCLPP_ALGORITHM_HPP_

View File

@@ -98,12 +98,13 @@ class Env {
/// debugging purposes. Currently supports `all`, `broadcast`, `allreduce`, `reducescatter`, and `allgather`.
const std::string forceNcclFallbackOperation;
/// Env name: `MSCCLPP_DISABLE_CHANNEL_CACHE`. If set to true, it will disable the channel cache for NCCL APIs.
/// Currently, this should be set to true if the application may call NCCL APIs on the same local buffer with
/// different remote buffers, e.g., in the case of a dynamic communicator. If CUDA/HIP graphs are used, disabling
/// the channel cache won't affect the performance, but otherwise it may lead to performance degradation.
/// Env name: `MSCCLPP_NCCL_SYMMETRIC_MEMORY`. If set to true, it indicates that the application uses symmetric memory
/// allocation across all ranks, making it safe to cache memory handles for all NCCL algorithms. If set to false, the
/// system will either use non-zero-copy algorithms (when CUDA/HIP graphs are not enabled) or set up new connections
/// every time (when CUDA/HIP graphs are enabled). This should be set to false if the application may call NCCL APIs
/// on the same local buffer with different remote buffers, e.g., in the case of a dynamic communicator.
/// Default is false.
const bool disableChannelCache;
const bool ncclSymmetricMemory;
/// Env name: `MSCCLPP_FORCE_DISABLE_NVLS`. If set to true, it will disable the NVLS support in MSCCL++.
/// Default is false.

View File

@@ -47,7 +47,8 @@ class AlgorithmCollectionBuilder {
/// @return The built AlgorithmCollection containing all registered algorithms.
AlgorithmCollection build();
AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, int rank);
AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer,
size_t flagBufferSize, int rank);
private:
AlgorithmCollectionBuilder() = default;
@@ -55,7 +56,8 @@ class AlgorithmCollectionBuilder {
AlgoSelectFunc algoSelector_ = nullptr;
AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize);
AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize,
uintptr_t flagBuffer, size_t flagBufferSize);
AlgorithmCollection buildDefaultDslAlgorithms(int rank);
static std::shared_ptr<AlgorithmCollectionBuilder> gAlgorithmCollectionBuilder_;

View File

@@ -15,6 +15,7 @@ using cudaGraphExec_t = hipGraphExec_t;
using cudaDeviceProp = hipDeviceProp_t;
using cudaStream_t = hipStream_t;
using cudaStreamCaptureMode = hipStreamCaptureMode;
using cudaStreamCaptureStatus = hipStreamCaptureStatus;
using cudaMemcpyKind = hipMemcpyKind;
using cudaIpcMemHandle_t = hipIpcMemHandle_t;
@@ -35,6 +36,9 @@ constexpr auto cudaErrorNotSupported = hipErrorNotSupported;
constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal;
constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed;
constexpr auto cudaStreamCaptureStatusNone = hipStreamCaptureStatusNone;
constexpr auto cudaStreamCaptureStatusActive = hipStreamCaptureStatusActive;
constexpr auto cudaStreamCaptureStatusInvalidated = hipStreamCaptureStatusInvalidated;
constexpr auto cudaHostAllocMapped = hipHostMallocMapped;
constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined;
constexpr auto cudaMemcpyDefault = hipMemcpyDefault;
@@ -98,6 +102,7 @@ constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVIC
#define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__)
#define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__)
#define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__)
#define cudaStreamIsCapturing(...) hipStreamIsCapturing(__VA_ARGS__)
#define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__)
#define cudaGraphInstantiate(...) hipGraphInstantiate(__VA_ARGS__)
#define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__)

View File

@@ -68,16 +68,17 @@ void register_algorithm(nb::module_& m) {
"execute",
[](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock,
std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory,
std::unordered_map<std::string, uintptr_t> extras) {
return self.execute(comm, reinterpret_cast<const void*>(input), reinterpret_cast<void*>(output),
inputSize, outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
nBlocks, nThreadsPerBlock, extras);
nBlocks, nThreadsPerBlock, symmetricMemory, extras);
},
nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0,
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>());
nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false,
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>())
.def("reset", &Algorithm::reset);
nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")
.def(nb::init<>())
@@ -108,8 +109,17 @@ void register_algorithm(nb::module_& m) {
.def_prop_ro("output_buffer",
[](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.outputBuffer); })
.def_ro("message_size", &CollectiveRequest::messageSize)
.def_prop_ro("stream", [](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.stream); })
.def_prop_ro("collective", [](const CollectiveRequest& self) { return self.collective; })
.def_ro("dtype", &CollectiveRequest::dtype)
.def_prop_ro("hints", [](const CollectiveRequest& self) { return self.hints; })
.def("buffer_mode", &CollectiveRequest::bufferMode);
m.def(
"cpp_get_default_flag_buffer",
[]() {
auto [buffer, size] = getDefaultFlagBuffer();
return std::make_pair(reinterpret_cast<uintptr_t>(buffer.get()), size);
},
"Get the default flag buffer. Returns a tuple of (buffer_ptr, buffer_size).");
}

View File

@@ -44,7 +44,9 @@ void register_core(nb::module_& m) {
.value("uint32", DataType::UINT32)
.value("float16", DataType::FLOAT16)
.value("float32", DataType::FLOAT32)
.value("bfloat16", DataType::BFLOAT16);
.value("bfloat16", DataType::BFLOAT16)
.value("float8_e4m3", DataType::FP8_E4M3)
.value("float8_e5m2", DataType::FP8_E5M2);
nb::class_<Bootstrap>(m, "CppBootstrap")
.def("get_rank", &Bootstrap::getRank)

View File

@@ -29,6 +29,6 @@ void register_algorithm_collection_builder(nb::module_& m) {
nb::arg("selector"))
.def("build", &AlgorithmCollectionBuilder::build)
.def("build_default_algorithms", &AlgorithmCollectionBuilder::buildDefaultAlgorithms, nb::arg("scratch_buffer"),
nb::arg("scratch_buffer_size"), nb::arg("rank"))
nb::arg("scratch_buffer_size"), nb::arg("flag_buffer"), nb::arg("flag_buffer_size"), nb::arg("rank"))
.def_static("reset", &AlgorithmCollectionBuilder::reset);
}

View File

@@ -4,6 +4,7 @@
from __future__ import annotations
from typing import Optional, Tuple, Dict
from functools import cached_property
import cupy as cp
from mscclpp._mscclpp import (
@@ -18,6 +19,7 @@ from mscclpp._mscclpp import (
CppReduceOp,
CppAlgorithmBuilder,
CppAlgorithmCollection,
cpp_get_default_flag_buffer,
)
__all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"]
@@ -160,6 +162,7 @@ class Algorithm:
executor: Optional[CppExecutor] = None,
nblocks=0,
nthreads_per_block=0,
symmetric_memory: bool = False,
extras: Optional[Dict[str, int]] = None,
) -> int:
"""Execute the collective algorithm.
@@ -176,6 +179,7 @@ class Algorithm:
executor: The executor for DSL algorithms (required for DSL, optional for native).
nblocks: Number of CUDA blocks (0 for auto-selection).
nthreads_per_block: Number of threads per block (0 for auto-selection).
symmetric_memory: Whether to use symmetric memory optimization (default: False).
extras: Additional algorithm-specific parameters.
Returns:
@@ -193,9 +197,14 @@ class Algorithm:
executor,
nblocks,
nthreads_per_block,
symmetric_memory,
extras if extras is not None else {},
)
def reset(self):
"""Reset the internal state of the algorithm, if applicable."""
self._algorithm.reset()
class AlgorithmBuilder:
def __init__(self, algorithm_builder: CppAlgorithmBuilder):
@@ -230,3 +239,17 @@ class AlgorithmCollection:
"""Register an algorithm for a collective operation."""
self._native_collection.register_algorithm(collective, algo_name, algorithm._algorithm)
self._algorithms.append(algorithm)
def get_default_flag_buffer() -> cp.ndarray:
    """Get the default flag buffer shared by the default algorithms.

    The buffer is allocated once on the C++ side (via
    ``cpp_get_default_flag_buffer``) and wrapped here as a *non-owning* CuPy
    array: the underlying GPU memory is owned by the C++ runtime, so the
    returned array must not outlive it.

    Returns:
        A CuPy ``uint32`` array viewing the shared flag buffer on the GPU.
    """
    buffer_ptr, buffer_size = cpp_get_default_flag_buffer()
    # Wrap the raw device pointer without taking ownership (owner=None).
    memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, None), 0)
    # buffer_size is in bytes; 4 bytes per uint32 flag element.
    return cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr)

View File

@@ -3,7 +3,7 @@
from __future__ import annotations
from typing import Union
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_default_flag_buffer
import atexit
from mscclpp._mscclpp import CppAlgorithmCollectionBuilder
@@ -29,6 +29,7 @@ class AlgorithmCollectionBuilder:
if not hasattr(self, "_initialized"):
self._builder = CppAlgorithmCollectionBuilder.get_instance()
self._initialized = True
self._flag_buffer = None
def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]):
if isinstance(algorithm_builder, AlgorithmBuilder):
@@ -50,8 +51,17 @@ class AlgorithmCollectionBuilder:
collection = self._builder.build()
return AlgorithmCollection(collection)
def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection:
native_collection = self._builder.build_default_algorithms(int(scratch_buffer), scratch_buffer_size, rank)
def build_default_algorithms(
    self,
    scratch_buffer: int,
    scratch_buffer_size: int,
    rank: int,
) -> AlgorithmCollection:
    """Build the default set of collective algorithms.

    Lazily allocates the shared flag buffer on first call, caches it on the
    builder instance, and forwards its device pointer and size to the native
    builder so all default algorithms operate on one unified flag buffer.

    Args:
        scratch_buffer: Device pointer (as an int) to the scratch buffer.
        scratch_buffer_size: Size of the scratch buffer in bytes.
        rank: Rank of the current process.

    Returns:
        An AlgorithmCollection wrapping the native collection.
    """
    # Allocate once and keep a reference so the GPU flag buffer stays alive
    # for the lifetime of this builder and is reused across rebuilds.
    if self._flag_buffer is None:
        self._flag_buffer = get_default_flag_buffer()
    native_collection = self._builder.build_default_algorithms(
        int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank
    )
    return AlgorithmCollection(native_collection)

View File

@@ -192,5 +192,11 @@ def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType:
return DataType.int32
elif dtype == torch.bfloat16:
return DataType.bfloat16
# Hardware supports either the OCP format or the FNUZ format for float8.
# Both are mapped to the same MSCCL++ data type.
elif dtype == torch.float8_e5m2 or dtype == torch.float8_e5m2fnuz:
return DataType.float8_e5m2
elif dtype == torch.float8_e4m3fn or dtype == torch.float8_e4m3fnuz:
return DataType.float8_e4m3
else:
raise ValueError(f"Unknown data type: {dtype}")

View File

@@ -3,6 +3,7 @@
#include <filesystem>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "logger.hpp"
@@ -40,12 +41,12 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF
CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) {
bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras) {
if (!initialized_) {
initFunc_(comm);
initialized_ = true;
}
AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype);
AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype, symmetricMemory);
auto it = contexts_.find(ctxKey);
if (it == contexts_.end()) {
auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype);
@@ -155,7 +156,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }
CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
std::shared_ptr<Executor> executor, int, int,
std::shared_ptr<Executor> executor, int, int, bool,
const std::unordered_map<std::string, uintptr_t>&) {
if (!executor) {
THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
@@ -198,4 +199,18 @@ std::shared_ptr<Algorithm> DslAlgorithm::build() { return shared_from_this(); }
// TODO: implement this
void DslAlgorithm::reset() {}
// Process-wide cache of the default flag buffer. A weak_ptr is held so the GPU
// allocation is released once all callers drop their shared_ptr, and a later
// call transparently re-allocates it.
static std::weak_ptr<uint32_t> gDefaultFlagBuffer;
// Number of uint32_t flag slots in the default buffer (kernels index the
// buffer by blockIdx.x, so this bounds the usable block count).
static size_t gDefaultFlagCount = 128;

/// Get the default GPU flag buffer (allocated lazily, reused while any caller
/// still holds a reference). All flag slots are initialized to 1 on first
/// allocation — presumably so kernels deriving state from the flag value
/// (e.g. flag % numScratchBuff) start consistently; confirm against kernels.
/// NOTE(review): access to gDefaultFlagBuffer is not synchronized — assumes
/// this is only called from a single thread; confirm that holds.
/// @return A pair of (shared_ptr owning the device buffer, size in bytes).
std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer() {
  std::shared_ptr<uint32_t> flagBuffer = gDefaultFlagBuffer.lock();
  if (!flagBuffer) {
    flagBuffer = mscclpp::detail::gpuCallocShared<uint32_t>(gDefaultFlagCount);
    std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);
    // gpuMemcpy count is in elements here (deduced uint32_t), not bytes.
    mscclpp::gpuMemcpy(flagBuffer.get(), initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);
    gDefaultFlagBuffer = flagBuffer;
  }
  return {flagBuffer, gDefaultFlagCount * sizeof(uint32_t)};
}
} // namespace mscclpp

View File

@@ -64,7 +64,7 @@ Env::Env()
cudaIpcUseDefaultStream(readEnv<bool>("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)),
ncclSharedLibPath(readEnv<std::string>("MSCCLPP_NCCL_LIB_PATH", "")),
forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
disableChannelCache(readEnv<bool>("MSCCLPP_DISABLE_CHANNEL_CACHE", false)),
ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)) {}
std::shared_ptr<Env> env() {
@@ -91,7 +91,7 @@ std::shared_ptr<Env> env() {
logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream);
logEnv("MSCCLPP_NCCL_LIB_PATH", globalEnv->ncclSharedLibPath);
logEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", globalEnv->forceNcclFallbackOperation);
logEnv("MSCCLPP_DISABLE_CHANNEL_CACHE", globalEnv->disableChannelCache);
logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
}
return globalEnv;

View File

@@ -49,8 +49,9 @@ AlgorithmCollection AlgorithmCollectionBuilder::build() {
void AlgorithmCollectionBuilder::reset() { gAlgorithmCollectionBuilder_.reset(); }
AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t scratchBuffer,
size_t scratchBufferSize, int rank) {
auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize);
size_t scratchBufferSize, uintptr_t flagBuffer,
size_t flagBufferSize, int rank) {
auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize);
auto dslCollection = buildDefaultDslAlgorithms(rank);
nativeCollection.extend(dslCollection);
nativeCollection.setSelectors(algoSelector_, fallbackAlgoSelector_);
@@ -58,11 +59,15 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t
}
AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uintptr_t scratchBuffer,
size_t scratchBufferSize) {
size_t scratchBufferSize,
uintptr_t flagBuffer,
size_t flagBufferSize) {
AlgorithmCollection collection;
auto allreduceAllpairPkt = std::make_shared<AllreduceAllpairPacket>(scratchBuffer, scratchBufferSize)->build();
auto allreduceAllpairPkt =
std::make_shared<AllreduceAllpairPacket>(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build();
collection.registerAlgorithm(allreduceAllpairPkt->collective(), allreduceAllpairPkt->name(), allreduceAllpairPkt);
auto allreduceNvlsPacket = std::make_shared<AllreduceNvlsPacket>(scratchBuffer, scratchBufferSize)->build();
auto allreduceNvlsPacket =
std::make_shared<AllreduceNvlsPacket>(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build();
collection.registerAlgorithm(allreduceNvlsPacket->collective(), allreduceNvlsPacket->name(), allreduceNvlsPacket);
auto allreduceNvlsWithCopy = std::make_shared<AllreduceNvlsWithCopy>(scratchBuffer, scratchBufferSize)->build();
collection.registerAlgorithm(allreduceNvlsWithCopy->collective(), allreduceNvlsWithCopy->name(),
@@ -70,7 +75,8 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uin
auto allreduceNvlsWithCopy2 = std::make_shared<AllreduceNvlsWithCopy2>(scratchBuffer, scratchBufferSize)->build();
collection.registerAlgorithm(allreduceNvlsWithCopy2->collective(), allreduceNvlsWithCopy2->name(),
allreduceNvlsWithCopy2);
auto allreducePkt = std::make_shared<AllreducePacket>(scratchBuffer, scratchBufferSize)->build();
auto allreducePkt =
std::make_shared<AllreducePacket>(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build();
collection.registerAlgorithm(allreducePkt->collective(), allreducePkt->name(), allreducePkt);
auto allreduceNvls = std::make_shared<AllreduceNvls>()->build();
collection.registerAlgorithm(allreduceNvls->collective(), allreduceNvls->name(), allreduceNvls);

View File

@@ -170,7 +170,7 @@ std::shared_ptr<void> AllgatherFullmesh::initAllgatherContext(std::shared_ptr<Co
return ctx;
}
AlgorithmCtxKey AllgatherFullmesh::generateAllgatherContextKey(const void*, void*, size_t, DataType) {
AlgorithmCtxKey AllgatherFullmesh::generateAllgatherContextKey(const void*, void*, size_t, DataType, bool) {
// always return same key, non-zero copy algo
return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
@@ -189,8 +189,9 @@ std::shared_ptr<Algorithm> AllgatherFullmesh::build() {
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllgatherContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective

View File

@@ -107,12 +107,6 @@ __global__ void __launch_bounds__(1024, 1)
}
}
AllgatherFullmesh2::AllgatherFullmesh2() : disableChannelCache_(false) {
if (mscclpp::env()->disableChannelCache) {
disableChannelCache_ = true;
}
}
void AllgatherFullmesh2::initialize(std::shared_ptr<Communicator> comm) {
this->conns_ = setupConnections(comm);
this->memorySemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_);
@@ -174,7 +168,7 @@ std::shared_ptr<void> AllgatherFullmesh2::initAllgatherContext(std::shared_ptr<m
CUdeviceptr recvBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
size_t channelOutOffset = (char*)output - (char*)recvBasePtr;
if (disableChannelCache_) {
if (!symmetricMemory_) {
channelOutOffset = 0;
recvBytes = inputSize * comm->bootstrap()->getNranks();
recvBasePtr = (CUdeviceptr)output;
@@ -197,10 +191,11 @@ std::shared_ptr<void> AllgatherFullmesh2::initAllgatherContext(std::shared_ptr<m
}
mscclpp::AlgorithmCtxKey AllgatherFullmesh2::generateAllgatherContextKey(const void*, void* output, size_t,
mscclpp::DataType) {
mscclpp::DataType, bool symmetricMemory) {
static int tag = 0;
if (disableChannelCache_) {
// always return a new key if channel cache is disabled
symmetricMemory_ = symmetricMemory;
if (!symmetricMemory_) {
// always return a new key if symmetric memory is not enabled.
return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, tag++};
}
size_t recvBytes;
@@ -224,7 +219,9 @@ std::shared_ptr<Algorithm> AllgatherFullmesh2::build() {
[[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->generateAllgatherContextKey(input, output, inputSize, dtype); });
mscclpp::DataType dtype, bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective

View File

@@ -11,29 +11,18 @@
namespace mscclpp {
namespace collective {
__device__ uint32_t deviceFlag = 1;
template <ReduceOp OpType, typename T, bool flagPerBlock = false>
template <ReduceOp OpType, typename T>
__global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode,
int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags) {
int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags,
uint32_t flagSize) {
// This version of allreduce only works for single nodes
if (worldSize != nRanksPerNode) return;
if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
const int nPeers = nRanksPerNode - 1;
uint32_t flag = 0;
if constexpr (flagPerBlock) {
flag = ((uint32_t*)flags)[blockIdx.x];
} else {
flag = deviceFlag;
__syncthreads();
if (threadIdx.x == 0) {
((LL8Packet*)flags)[blockIdx.x].write(0, flag);
}
}
uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
size_t scratchBaseOffset = (flag % numScratchBuff) ? (scratchBufferSize / numScratchBuff) : 0;
size_t channelScratchOffset = scratchBaseOffset;
@@ -62,22 +51,12 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand
}
dst[idx] = data;
}
if constexpr (flagPerBlock) {
__syncthreads();
if (threadIdx.x == 0) {
((uint32_t*)flags)[blockIdx.x] = flag + 1;
}
} else {
// Make sure all threadblocks have finished reading before incrementing the flag
if (blockIdx.x == 0 && threadIdx.x < gridDim.x) {
((LL8Packet*)flags)[threadIdx.x].read(flag, -1);
}
if (blockIdx.x == 0) {
__syncthreads();
}
if (threadIdx.x == 0 && blockIdx.x == 0) {
deviceFlag++;
}
__syncthreads();
if (threadIdx.x == 0) {
((uint32_t*)flags)[blockIdx.x] = flag + 1;
}
if (blockIdx.x == 0 && threadIdx.x >= gridDim.x && threadIdx.x < flagSize / sizeof(uint32_t)) {
((uint32_t*)flags)[threadIdx.x] = flag + 1;
}
}
@@ -93,19 +72,13 @@ struct AllpairAdapter {
static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize,
cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0,
cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0,
int nThreadsPerBlock = 0) {
using ChannelType = DeviceHandle<MemoryChannel>;
const size_t nelems = inputSize / sizeof(T);
if (nBlocks == 7 || nBlocks == 28) {
allreduceAllPairs<OpType, T, true><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
(T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
nRanksPerNode, worldSize, nelems, numScratchBuff, flags);
return cudaGetLastError();
}
allreduceAllPairs<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
(T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
nRanksPerNode, worldSize, nelems, numScratchBuff, flags);
nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize);
return cudaGetLastError();
}
};
@@ -116,12 +89,6 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr<Communicator> comm) {
RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc);
registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory);
registeredMemories_.push_back(scratchMemory);
flags_ = detail::gpuCallocShared<LL8Packet>(maxBlockNum_);
std::vector<uint32_t> flags(28, 1);
flags7_ = detail::gpuCallocShared<uint32_t>(7);
flags28_ = detail::gpuCallocShared<uint32_t>(28);
gpuMemcpy<uint32_t>(flags7_.get(), flags.data(), 7, cudaMemcpyHostToDevice);
gpuMemcpy<uint32_t>(flags28_.get(), flags.data(), 28, cudaMemcpyHostToDevice);
}
CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
@@ -133,13 +100,6 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<voi
if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize);
}
void* flags = this->flags_.get();
if (blockAndThreadNum.first == 7) {
flags = this->flags7_.get();
} else if (blockAndThreadNum.first == 28) {
flags = this->flags28_.get();
}
size_t sendBytes;
CUdeviceptr sendBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
@@ -150,10 +110,11 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<voi
WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
return CommResult::CommInvalidArgument;
}
cudaError_t error = allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr,
nullptr, nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank,
algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, flags,
this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second);
cudaError_t error =
allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr,
nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerNode,
algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_,
this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second);
if (error != cudaSuccess) {
WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error));
return CommResult::CommUnhandledCudaError;
@@ -185,7 +146,7 @@ std::shared_ptr<void> AllreduceAllpairPacket::initAllreduceContext(std::shared_p
return ctx;
}
AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType) {
AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType, bool) {
size_t sendBytes;
CUdeviceptr sendBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
@@ -193,7 +154,8 @@ AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void*
}
std::shared_ptr<Algorithm> AllreduceAllpairPacket::build() {
auto self = std::make_shared<AllreduceAllpairPacket>(reinterpret_cast<uintptr_t>(scratchBuffer_), scratchBufferSize_);
auto self = std::make_shared<AllreduceAllpairPacket>(reinterpret_cast<uintptr_t>(scratchBuffer_), scratchBufferSize_,
flagBuffer_, flagBufferSize_);
return std::make_shared<NativeAlgorithm>(
"default_allreduce_allpair_packet", "allreduce",
[self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
@@ -206,8 +168,9 @@ std::shared_ptr<Algorithm> AllreduceAllpairPacket::build() {
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective

View File

@@ -149,7 +149,8 @@ struct AllreduceAllconnectAdapter {
static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels,
DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t,
size_t channelOutDataOffset, size_t, int rank, int nRanksPerNode, int worldSize,
size_t inputSize, cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) {
size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks,
int nThreadsPerBlock) {
using ChannelType = DeviceHandle<MemoryChannel>;
size_t nelems = inputSize / sizeof(T);
if (nBlocks == 0) nBlocks = 35;
@@ -180,8 +181,11 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr<void> ct
auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
size_t recvBytes;
CUdeviceptr recvBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
size_t channelOutOffset = (char*)output - (char*)recvBasePtr;
size_t channelOutOffset = 0;
if (symmetricMemory_) {
MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
channelOutOffset = (char*)output - (char*)recvBasePtr;
}
std::shared_ptr<DeviceHandle<MemoryChannel>> inputChannelHandles;
if (this->memoryChannelsMap_.find(input) != this->memoryChannelsMap_.end()) {
inputChannelHandles = this->memoryChannelsMap_[input].second;
@@ -204,7 +208,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr<void> ct
cudaError_t error =
allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(),
nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize,
stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
if (error != cudaSuccess) {
WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error));
return CommResult::CommUnhandledCudaError;
@@ -212,19 +216,21 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr<void> ct
return CommResult::CommSuccess;
}
AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType) {
AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType,
bool symmetricMemory) {
static int tag = 0;
size_t recvBytes;
CUdeviceptr recvBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
if (env()->disableChannelCache) {
symmetricMemory_ = symmetricMemory;
if (!symmetricMemory_) {
return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, tag++};
}
return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, 0};
}
std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Communicator> comm, const void*,
void* output, size_t, DataType) {
void* output, size_t size, DataType) {
auto ctx = std::make_shared<AlgorithmCtx>();
ctx->rank = comm->bootstrap()->getRank();
ctx->workSize = comm->bootstrap()->getNranks();
@@ -236,6 +242,10 @@ std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Co
size_t recvBytes;
CUdeviceptr recvBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
if (!symmetricMemory_) {
recvBytes = size;
recvBasePtr = (CUdeviceptr)output;
}
RegisteredMemory localMemory = comm->registerMemory((void*)recvBasePtr, recvBytes, Transport::CudaIpc);
ctx->registeredMemories = setupRemoteMemories(comm, ctx->rank, localMemory);
ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory,
@@ -258,8 +268,9 @@ std::shared_ptr<Algorithm> AllreduceFullmesh::build() {
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective

View File

@@ -23,9 +23,18 @@ __global__ void __launch_bounds__(1024, 1)
int nBlocks = gridDim.x;
int bid = blockIdx.x;
size_t sizePerRank = size / nRanksPerNode;
size_t sizePerBlock = sizePerRank / nBlocks;
const size_t minAlign = 16;
// Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore
size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks;
sizePerBlock = (sizePerBlock + minAlign - 1) / minAlign * minAlign;
size_t rankOffset = sizePerRank * rank;
size_t blockOffset = sizePerBlock * bid + rankOffset;
size_t curBlockSize = 0;
if (sizePerBlock * bid < sizePerRank) {
curBlockSize = min(sizePerBlock, sizePerRank - sizePerBlock * bid);
}
mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastPtr = multicast + bid;
mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastOutPtr = multicastOut + bid;
@@ -44,8 +53,10 @@ __global__ void __launch_bounds__(1024, 1)
__syncthreads();
T* src = (T*)multicastPtr->mcPtr;
T* dst = (T*)multicastOutPtr->mcPtr;
handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock,
threadIdx.x, blockDim.x);
if (curBlockSize > 0) {
handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, curBlockSize,
threadIdx.x, blockDim.x);
}
__syncthreads();
if (threadIdx.x < nPeers) {
channels[threadIdx.x].relaxedSignal();
@@ -60,7 +71,7 @@ struct NvlsAdapter {
mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsOutChannels, size_t channelInOffset,
size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t inputSize,
cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) {
cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
#if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)
if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
return cudaErrorNotSupported;
@@ -77,7 +88,12 @@ struct NvlsAdapter {
};
void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm) {
nSwitchChannels_ = 8;
int device;
MSCCLPP_CUDATHROW(cudaGetDevice(&device));
cudaDeviceProp deviceProp;
MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device));
computeCapabilityMajor_ = deviceProp.major;
nSwitchChannels_ = 32;
this->conns_ = setupConnections(comm);
// setup semaphores
std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =
@@ -91,6 +107,10 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
size_t inputSize, mscclpp::DataType dtype, ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>&) {
if (!symmetricMemory_) {
WARN("AllreduceNvls requires symmetric memory for now.");
return CommResult::CommInvalidArgument;
}
auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype);
if (!allreduce) {
@@ -110,12 +130,16 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
}
std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) {
numBlocksAndThreads = {ctx->nRanksPerNode, 1024};
numBlocksAndThreads = {::min(ctx->nRanksPerNode, nSwitchChannels_), 1024};
// For GB200 devices, use more blocks to improve performance when nRanksPerNode <= 8
if (computeCapabilityMajor_ == 10 && ctx->nRanksPerNode <= 8) {
numBlocksAndThreads.first = ::min(32, nSwitchChannels_);
}
}
cudaError_t error =
allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels,
nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize,
inputSize, stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
if (error != cudaSuccess) {
WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error));
return CommResult::CommUnhandledCudaError;
@@ -124,7 +148,8 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
}
mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t,
mscclpp::DataType) {
mscclpp::DataType, bool symmetricMemory) {
symmetricMemory_ = symmetricMemory;
size_t sendBytes, recvBytes;
CUdeviceptr sendBasePtr, recvBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
@@ -174,7 +199,9 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvls::build() {
[[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); });
mscclpp::DataType dtype, bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective
} // namespace mscclpp

View File

@@ -9,25 +9,15 @@
namespace mscclpp {
namespace collective {
__device__ uint32_t deviceFlag = 1;
template <ReduceOp OpType, typename T, bool flagPerBlock = false>
template <ReduceOp OpType, typename T>
__global__ void __launch_bounds__(1024, 1)
allreduceNvlsPacket([[maybe_unused]] const T* input, [[maybe_unused]] T* scratch, [[maybe_unused]] T* output,
[[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicast,
[[maybe_unused]] size_t nelems, [[maybe_unused]] size_t scratchBufferSize,
[[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags) {
[[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags,
[[maybe_unused]] uint32_t flagBufferSize) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
uint32_t flag = 0;
if constexpr (flagPerBlock) {
flag = ((uint32_t*)flags)[blockIdx.x];
} else {
flag = deviceFlag;
__syncthreads();
if (threadIdx.x == 0) {
((LL8Packet*)flags)[blockIdx.x].write(0, flag);
}
}
uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
size_t scratchBaseOffset = (flag % 2) ? scratchBufferSize / 2 : 0;
uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
uint32_t nPktPerRank = nelems / worldSize / (sizeof(mscclpp::LL8Packet::Payload) / sizeof(T));
@@ -51,21 +41,13 @@ __global__ void __launch_bounds__(1024, 1)
}
dst[i] = data;
}
if constexpr (flagPerBlock) {
__syncthreads();
if (threadIdx.x == 0) {
((uint32_t*)flags)[blockIdx.x] = flag + 1;
}
} else {
if (blockIdx.x == 0 && threadIdx.x < gridDim.x) {
((LL8Packet*)flags)[threadIdx.x].read(flag, -1);
}
if (blockIdx.x == 0) {
__syncthreads();
}
if (threadIdx.x == 0 && blockIdx.x == 0) {
deviceFlag++;
}
__syncthreads();
if (threadIdx.x == 0) {
((uint32_t*)flags)[blockIdx.x] = flag + 1;
}
// update the remaining flags in case a different number of blocks is used in the next launch
if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) {
((uint32_t*)flags)[threadIdx.x] = flag + 1;
}
#endif
}
@@ -85,30 +67,17 @@ struct AllreduceNvlsPacketAdapter {
static cudaError_t call(const void* input, void* scratch, void* output, void*, void*,
DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
size_t scratchBufferSize, int rank, int, int worldSize, size_t inputSize, cudaStream_t stream,
void* flags, uint32_t, int nBlocks, int nThreadsPerBlock) {
if (nBlocks == 4 || nBlocks == 8) {
allreduceNvlsPacket<OpType, T, true>
<<<nBlocks, nThreadsPerBlock, 0, stream>>>((const T*)input, (T*)scratch, (T*)output, nvlsChannels,
inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags);
} else {
allreduceNvlsPacket<OpType, T>
<<<nBlocks, nThreadsPerBlock, 0, stream>>>((const T*)input, (T*)scratch, (T*)output, nvlsChannels,
inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags);
}
void* flags, uint32_t flagBufferSize, uint32_t, int nBlocks, int nThreadsPerBlock) {
allreduceNvlsPacket<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
(const T*)input, (T*)scratch, (T*)output, nvlsChannels, inputSize / sizeof(T), scratchBufferSize, rank,
worldSize, flags, flagBufferSize);
return cudaGetLastError();
}
};
void AllreduceNvlsPacket::initialize(std::shared_ptr<Communicator>) {
std::vector<uint32_t> flags(8, 1);
flags_ = detail::gpuCallocShared<LL8Packet>(16);
flags4_ = detail::gpuCallocShared<uint32_t>(4);
flags8_ = detail::gpuCallocShared<uint32_t>(8);
gpuMemcpy<uint32_t>(flags4_.get(), flags.data(), 4, cudaMemcpyHostToDevice);
gpuMemcpy<uint32_t>(flags8_.get(), flags.data(), 8, cudaMemcpyHostToDevice);
}
void AllreduceNvlsPacket::initialize(std::shared_ptr<Communicator>) {}
AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType) {
AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
@@ -146,16 +115,10 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<void>
WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
return CommResult::CommInvalidArgument;
}
void* flags = this->flags_.get();
if (blockAndThreadNum.first == 4) {
flags = this->flags4_.get();
} else if (blockAndThreadNum.first == 8) {
flags = this->flags8_.get();
}
cudaError_t error =
allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr,
0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, flags,
0, blockAndThreadNum.first, blockAndThreadNum.second);
0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream,
(void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second);
if (error != cudaSuccess) {
WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error));
return CommResult::CommUnhandledCudaError;
@@ -164,7 +127,8 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<void>
}
std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsPacket::build() {
auto self = std::make_shared<AllreduceNvlsPacket>((uintptr_t)scratchBuffer_, scratchBufferSize_);
auto self = std::make_shared<AllreduceNvlsPacket>((uintptr_t)scratchBuffer_, scratchBufferSize_, flagBuffer_,
flagBufferSize_);
return std::make_shared<mscclpp::NativeAlgorithm>(
"default_allreduce_nvls_packet", "allreduce",
[self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
@@ -178,7 +142,9 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsPacket::build() {
[[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); });
mscclpp::DataType dtype, bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective
} // namespace mscclpp

View File

@@ -113,7 +113,7 @@ struct NvlsWithCopyAdapter {
static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize,
cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) {
cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
return cudaErrorNotSupported;
@@ -157,7 +157,7 @@ CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptr<void
}
cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0,
ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0,
blockAndThreadNum.first, blockAndThreadNum.second);
if (error != cudaSuccess) {
WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error));
@@ -166,7 +166,7 @@ CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptr<void
return CommResult::CommSuccess;
}
AlgorithmCtxKey AllreduceNvlsWithCopy::generateAllreduceContextKey(const void*, void*, size_t, DataType) {
AlgorithmCtxKey AllreduceNvlsWithCopy::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
@@ -199,8 +199,9 @@ std::shared_ptr<Algorithm> AllreduceNvlsWithCopy::build() {
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective

View File

@@ -150,7 +150,7 @@ struct NvlsWithCopy2Adapter {
static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize,
cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) {
cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
return cudaErrorNotSupported;
@@ -194,7 +194,7 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr<voi
}
cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0,
ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0,
blockAndThreadNum.first, blockAndThreadNum.second);
if (error != cudaSuccess) {
WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error));
@@ -203,7 +203,7 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr<voi
return CommResult::CommSuccess;
}
AlgorithmCtxKey AllreduceNvlsWithCopy2::generateAllreduceContextKey(const void*, void*, size_t, DataType) {
AlgorithmCtxKey AllreduceNvlsWithCopy2::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
@@ -236,8 +236,9 @@ std::shared_ptr<Algorithm> AllreduceNvlsWithCopy2::build() {
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}

View File

@@ -11,13 +11,11 @@
namespace mscclpp {
namespace collective {
__device__ uint32_t deviceFlag = 1;
template <ReduceOp OpType, typename T>
__global__ void __launch_bounds__(1024, 1)
allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize,
size_t nelems, void* flags, uint32_t numScratchBuff
size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff
#if defined(ENABLE_NPKIT)
,
NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) {
@@ -60,11 +58,7 @@ __global__ void __launch_bounds__(1024, 1)
const int nPeers = nRanksPerNode - 1;
const size_t nPkts = nelems / 2;
uint32_t flag = deviceFlag;
__syncthreads();
if (threadIdx.x == 0) {
((LL8Packet*)flags)[blockIdx.x].write(0, flag);
}
uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
size_t channelScratchOffset = (flag % numScratchBuff) ? scratchBufferSize / numScratchBuff : 0;
int nelemsPerRank = nelems / worldSize;
@@ -129,15 +123,12 @@ __global__ void __launch_bounds__(1024, 1)
result[idx].y = data.y;
}
// Make sure all threadblocks have finished reading before incrementing the flag
if (blockIdx.x == 0 && threadIdx.x < gridDim.x) {
((LL8Packet*)flags)[threadIdx.x].read(flag, -1);
__syncthreads();
if (threadIdx.x == 0) {
((uint32_t*)flags)[blockIdx.x] = flag + 1;
}
if (blockIdx.x == 0) {
__syncthreads();
}
if (threadIdx.x == 0 && blockIdx.x == 0) {
deviceFlag++;
if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) {
((uint32_t*)flags)[threadIdx.x] = flag + 1;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && \
defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT)
@@ -156,20 +147,22 @@ struct PacketAdapter {
static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize,
cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0,
int nThreadsPerBlock = 0) {
cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff,
int nBlocks = 0, int nThreadsPerBlock = 0) {
using ChannelType = DeviceHandle<MemoryChannel>;
const size_t nelems = inputSize / sizeof(T);
// Optimize the number of blocks to be multiple of (worldSize - 1)
nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1);
#if defined(ENABLE_NPKIT)
size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS;
allreducePacket<OpType><<<nBlocks, nThreadsPerBlock, sharedMemSize, stream>>>(
(T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
nRanksPerNode, worldSize, nelems, flags, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
NpKit::GetCpuTimestamp());
#else
allreducePacket<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
(T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
nRanksPerNode, worldSize, nelems, flags, numScratchBuff);
nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff);
#endif
return cudaGetLastError();
}
@@ -215,7 +208,6 @@ void AllreducePacket::initialize(std::shared_ptr<Communicator> comm) {
RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc);
registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory);
registeredMemories_.push_back(scratchMemory);
flags_ = detail::gpuCallocShared<LL8Packet>(maxBlockNum_);
}
CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input, void* output,
@@ -233,7 +225,6 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
size_t channelInOffset = (char*)input - (char*)sendBasePtr;
void* flags = this->flags_.get();
AllreduceFunc allreduce = dispatch<PacketAdapter>(op, dtype);
if (!allreduce) {
WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
@@ -242,7 +233,8 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
cudaError_t error =
allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr,
channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize,
stream, flags, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second);
stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_,
blockAndThreadNum.first, blockAndThreadNum.second);
if (error != cudaSuccess) {
WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error));
return CommResult::CommUnhandledCudaError;
@@ -274,7 +266,7 @@ std::shared_ptr<void> AllreducePacket::initAllreduceContext(std::shared_ptr<Comm
return ctx;
}
AlgorithmCtxKey AllreducePacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType) {
AlgorithmCtxKey AllreducePacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType, bool) {
size_t sendBytes;
CUdeviceptr sendBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
@@ -282,7 +274,8 @@ AlgorithmCtxKey AllreducePacket::generateAllreduceContextKey(const void* input,
}
std::shared_ptr<Algorithm> AllreducePacket::build() {
auto self = std::make_shared<AllreducePacket>(reinterpret_cast<uintptr_t>(scratchBuffer_), scratchBufferSize_);
auto self = std::make_shared<AllreducePacket>(reinterpret_cast<uintptr_t>(scratchBuffer_), scratchBufferSize_,
flagBuffer_, flagBufferSize_);
return std::make_shared<NativeAlgorithm>(
"default_allreduce_packet", "allreduce", [self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
@@ -294,8 +287,9 @@ std::shared_ptr<Algorithm> AllreducePacket::build() {
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}

View File

@@ -25,7 +25,7 @@ class AllgatherFullmesh : public AlgorithmBuilder {
std::shared_ptr<void> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm, const void*, void* output,
size_t, mscclpp::DataType);
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType);
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType, bool);
void* scratchBuffer_;
size_t scratchBufferSize_;

View File

@@ -11,11 +11,11 @@ namespace collective {
class AllgatherFullmesh2 : public AlgorithmBuilder {
public:
AllgatherFullmesh2();
AllgatherFullmesh2() = default;
std::shared_ptr<Algorithm> build() override;
private:
bool disableChannelCache_;
bool symmetricMemory_;
std::vector<Connection> conns_;
std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores_;
const int nChannelsPerConnection_ = 35;
@@ -27,7 +27,7 @@ class AllgatherFullmesh2 : public AlgorithmBuilder {
std::shared_ptr<void> initAllgatherContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType, bool);
};
} // namespace collective

View File

@@ -9,8 +9,11 @@ namespace mscclpp {
namespace collective {
class AllreduceAllpairPacket : public AlgorithmBuilder {
public:
AllreduceAllpairPacket(uintptr_t scratchBuffer, size_t scratchBufferSize)
: scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){};
AllreduceAllpairPacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize)
: scratchBuffer_((void*)scratchBuffer),
scratchBufferSize_(scratchBufferSize),
flagBuffer_(flagBuffer),
flagBufferSize_(flagBufferSize){};
std::shared_ptr<Algorithm> build() override;
private:
@@ -21,7 +24,7 @@ class AllreduceAllpairPacket : public AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
void* scratchBuffer_;
size_t scratchBufferSize_;
@@ -30,9 +33,8 @@ class AllreduceAllpairPacket : public AlgorithmBuilder {
std::vector<Connection> conns_;
std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores_;
std::vector<RegisteredMemory> registeredMemories_;
std::shared_ptr<LL8Packet> flags_;
std::shared_ptr<uint32_t> flags7_;
std::shared_ptr<uint32_t> flags28_;
uintptr_t flagBuffer_;
size_t flagBufferSize_;
};
} // namespace collective
} // namespace mscclpp

View File

@@ -20,7 +20,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
void* scratchBuffer_;
size_t scratchBufferSize_;
std::shared_ptr<Communicator> comm_;
@@ -32,6 +32,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder {
RegisteredMemory localScratchMemory_;
std::unordered_map<const void*, std::pair<std::vector<MemoryChannel>, std::shared_ptr<DeviceHandle<MemoryChannel>>>>
memoryChannelsMap_;
bool symmetricMemory_ = false;
};
} // namespace collective
} // namespace mscclpp

View File

@@ -12,6 +12,7 @@ class AllreduceNvls : public AlgorithmBuilder {
std::shared_ptr<Algorithm> build() override;
private:
bool symmetricMemory_ = false;
void initialize(std::shared_ptr<Communicator> comm);
CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
@@ -19,13 +20,14 @@ class AllreduceNvls : public AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
const size_t nvlsBufferSize_ = (1 << 30);
uint32_t nSwitchChannels_;
std::shared_ptr<DeviceHandle<BaseMemoryChannel>> memoryChannelsDeviceHandle_;
std::vector<BaseMemoryChannel> baseChannels_;
std::vector<Connection> conns_;
int computeCapabilityMajor_{0};
};
} // namespace collective

View File

@@ -10,8 +10,11 @@ namespace mscclpp {
namespace collective {
class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder {
public:
AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize)
: scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){};
AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize)
: scratchBuffer_((void*)scratchBuffer),
scratchBufferSize_(scratchBufferSize),
flagBuffer_(flagBuffer),
flagBufferSize_(flagBufferSize){};
std::shared_ptr<mscclpp::Algorithm> build() override;
private:
@@ -22,15 +25,14 @@ class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<mscclpp::Communicator> comm, const void*, void* output,
size_t, mscclpp::DataType);
mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType);
mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType, bool);
void* scratchBuffer_;
size_t scratchBufferSize_;
const size_t nvlsBufferSize_ = (1 << 30);
const int maxBlockNum_ = 16;
std::shared_ptr<LL8Packet> flags_;
std::shared_ptr<uint32_t> flags4_;
std::shared_ptr<uint32_t> flags8_;
uintptr_t flagBuffer_;
size_t flagBufferSize_;
};
} // namespace collective
} // namespace mscclpp

View File

@@ -20,7 +20,7 @@ class AllreduceNvlsWithCopy : public AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
const size_t nvlsBufferSize_ = (1 << 30);
void* scratchBuffer_;

View File

@@ -23,7 +23,7 @@ class AllreduceNvlsWithCopy2 : public AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
const size_t nvlsBufferSize_ = (1 << 30);
void* scratchBuffer_;

View File

@@ -9,8 +9,11 @@ namespace mscclpp {
namespace collective {
class AllreducePacket : public AlgorithmBuilder {
public:
AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize)
: scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){};
AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize)
: scratchBuffer_((void*)scratchBuffer),
scratchBufferSize_(scratchBufferSize),
flagBuffer_(flagBuffer),
flagBufferSize_(flagBufferSize){};
std::shared_ptr<Algorithm> build() override;
private:
@@ -21,16 +24,17 @@ class AllreducePacket : public AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
void* scratchBuffer_;
size_t scratchBufferSize_;
const int nSegmentsForScratchBuffer_ = 2;
const int maxBlockNum_ = 56;
std::vector<Connection> conns_;
uintptr_t flagBuffer_;
size_t flagBufferSize_;
std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores_;
std::vector<RegisteredMemory> registeredMemories_;
std::shared_ptr<LL8Packet> flags_;
};
} // namespace collective
} // namespace mscclpp

View File

@@ -75,7 +75,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src
using AllreduceFunc =
std::function<cudaError_t(const void*, void*, void*, void*, void*, mscclpp::DeviceHandle<mscclpp::SwitchChannel>*,
mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t, int, int, int,
size_t, cudaStream_t, void*, uint32_t, int, int)>;
size_t, cudaStream_t, void*, uint32_t, uint32_t, int, int)>;
template <template <ReduceOp, typename> class Adapter>
AllreduceFunc dispatch(ReduceOp op, mscclpp::DataType dtype) {

View File

@@ -31,7 +31,6 @@ constexpr int NUM_SEMAPHORES = 64;
constexpr int MAX_NRANKS_PER_NODE = 8;
constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
static bool mscclppDisableChannelCache = env()->disableChannelCache;
std::vector<RegisteredMemory> setupRemoteMemories(std::shared_ptr<Communicator> comm, int rank,
RegisteredMemory localMemory);

View File

@@ -0,0 +1,172 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "algorithm_selector.hpp"
#include <mscclpp/env.hpp>
#include <mscclpp/utils.hpp>
#include "debug.h"
namespace mscclpp {
namespace nccl {
// Decide whether NVLS can be used for the given data type on this device.
// For non-FP8 types this simply forwards the device-level capability flag;
// FP8 additionally requires compute capability >= 10 and an arch-specific
// CUDA build on NVIDIA platforms.
static bool isNvlsSupportedForDataType(const AlgorithmSelectorConfig& config, DataType dtype) {
  bool nvlsSupported = config.nvlsSupported;
  const bool isFp8 = dtype == DataType::FP8_E4M3 || dtype == DataType::FP8_E5M2;
  if (!isFp8) {
    return nvlsSupported;
  }
  // FP8 handling
#if !defined(__HIP_PLATFORM_AMD__)
  // NVLS does not support FP8 on devices with compute capability < 10
  if (config.computeCapability.first < 10) {
    return false;
  }
  // NOTE(review): this tests __CUDA_ARCH_SPECIFIC__/__CUDA_ARCH_FAMILY_SPECIFIC__
  // at preprocessing time of this host TU — presumably the build defines these for
  // arch-specific (sm_10x) compilations; confirm against the build configuration.
#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__))
  return true;
#else
  return false;
#endif
#else
  // AMD path: no FP8-specific restriction beyond the device capability flag.
  return nvlsSupported;
#endif
}
/// Check whether a DSL execution plan is applicable to the given collective request.
/// A plan matches when its world size, ranks-per-node, collective name, buffer mode,
/// and message-size range all accept the request.
bool matchExecutionPlan(std::shared_ptr<DslAlgorithm> algo, const CollectiveRequest& request) {
  if (algo->constraint().worldSize != request.worldSize) return false;
  if (algo->constraint().nRanksPerNode != request.nRanksPerNode) return false;
  if (algo->collective() != request.collective) return false;
  if (algo->bufferMode() != CollectiveBufferMode::Any && request.bufferMode() != algo->bufferMode()) return false;
  // Allgather plans are ranged on the aggregated size (per-rank bytes * world size),
  // everything else on the raw message size.
  const size_t effectiveSize =
      (request.collective == "allgather") ? (request.messageSize * request.worldSize) : request.messageSize;
  return effectiveSize >= algo->messageRange().first && effectiveSize <= algo->messageRange().second;
}
// Blackwell (compute capability 10.x) specific allreduce selection.
// Decision ladder: tiny messages -> NVLS packet; without symmetric memory only the
// packet algorithm is viable (and only up to 2MB); with symmetric memory prefer the
// packet algorithm for small/medium sizes and zero-copy NVLS for the rest.
static std::shared_ptr<Algorithm> selectSingleNodeAllreduceBlackwell(
    const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
    const AlgorithmSelectorConfig& config) {
  const size_t msgBytes = request.messageSize;
  const bool nvlsOk = isNvlsSupportedForDataType(config, request.dtype);

  // Small messages always use NVLS packet algorithm.
  if (msgBytes <= (1 << 15)) {  // <= 32KB
    return algoMap.at("default_allreduce_nvls_packet");
  }

  if (!config.symmetricMemory) {
    // Without symmetric memory, only mid-sized messages are handled here.
    if (msgBytes <= (1 << 21)) {  // <= 2MB
      return algoMap.at("default_allreduce_packet");
    }
    return nullptr;
  }

  // Symmetric memory path: can use cached memory handles.
  const bool zeroCopyNvls = nvlsOk && config.isCuMemMapAllocated;
  if (msgBytes <= (1 << 16) || (!zeroCopyNvls && msgBytes <= (1 << 20))) {  // <= 64KB or <= 1MB
    return algoMap.at("default_allreduce_packet");
  }
  if (zeroCopyNvls) {
    return algoMap.at("default_allreduce_nvls");
  }
  INFO(MSCCLPP_NCCL, "No suitable kernel for Blackwell architecture, fallback to nccl/rccl");
  return nullptr;
}
/// Select an algorithm for single-node allreduce.
/// Dispatches to the Blackwell-specific ladder on compute capability 10.x;
/// otherwise picks by message size, NVLS availability for the data type, and
/// whether zero-copy NVLS (symmetric memory + cuMemMap-allocated buffers) applies.
/// Returns nullptr when no in-library algorithm fits, signaling fallback to NCCL/RCCL.
std::shared_ptr<Algorithm> selectSingleNodeAllreduce(
    const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
    const AlgorithmSelectorConfig& config) {
  // Use Blackwell-specific selection for compute capability 10.x
  if (config.computeCapability.first == 10) {
    return selectSingleNodeAllreduceBlackwell(algoMap, request, config);
  }

  const size_t messageSize = request.messageSize;

  // Determine NVLS availability based on data type and device capability
  const bool nvlsSupported = isNvlsSupportedForDataType(config, request.dtype);
  const bool useNvlsWithZeroCopy = nvlsSupported && config.symmetricMemory && config.isCuMemMapAllocated;

  // Very small messages: use allpair packet algorithm
  if (messageSize <= (1 << 14)) {  // <= 16KB
    return algoMap.at("default_allreduce_allpair_packet");
  }
  // Small messages with NVLS support
  if (messageSize <= (1 << 15) && nvlsSupported) {  // <= 32KB
    return algoMap.at("default_allreduce_nvls_packet");
  }
  // Medium messages: use packet algorithm
  if (messageSize <= (1 << 16) || (messageSize <= (1 << 20) && !useNvlsWithZeroCopy)) {  // <= 64KB or <= 1MB
    return algoMap.at("default_allreduce_packet");
  }
  // Large messages with NVLS zero-copy support.
  // (useNvlsWithZeroCopy already implies nvlsSupported, so no extra check is needed.)
  if (useNvlsWithZeroCopy) {
    return algoMap.at("default_allreduce_nvls");
  }
  // Large messages with NVLS but without zero-copy
  if (nvlsSupported) {
    if (messageSize < (1 << 24)) {  // < 16MB
      return algoMap.at("default_allreduce_nvls_with_copy");
    }
    return algoMap.at("default_allreduce_nvls_with_copy2");
  }
#if defined(__HIP_PLATFORM_AMD__)
  // AMD platform: use fullmesh algorithm
  return algoMap.at("default_allreduce_fullmesh");
#else
  // NVIDIA without NVLS: use fullmesh when no NCCL fallback library is loaded
  if (!config.ncclDlopenSharedLib) {
    return algoMap.at("default_allreduce_fullmesh");
  }
  return nullptr;
#endif
}
/// Select an algorithm for single-node allgather.
/// fullmesh2 handles everything up to 32MB (and all sizes on AMD); on NVIDIA,
/// larger messages use fullmesh only when no NCCL fallback library is loaded.
std::shared_ptr<Algorithm> selectSingleNodeAllgather(
    const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
    [[maybe_unused]] const AlgorithmSelectorConfig& config) {
  constexpr size_t fullmesh2MaxBytes = 32 * (1 << 20);  // 32MB
  if (request.messageSize <= fullmesh2MaxBytes) {
    return algoMap.at("default_allgather_fullmesh2");
  }
#if defined(__HIP_PLATFORM_AMD__)
  // AMD platform always uses fullmesh2, regardless of message size.
  return algoMap.at("default_allgather_fullmesh2");
#else
  // NVIDIA: fall back to NCCL when its shared library is available.
  if (config.ncclDlopenSharedLib) {
    return nullptr;
  }
  return algoMap.at("default_allgather_fullmesh");
#endif
}
/// Placeholder for multi-node algorithm selection.
/// Currently always returns nullptr so the caller falls back to NCCL/RCCL.
std::shared_ptr<Algorithm> selectMultiNodeAlgorithm(
    const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap [[maybe_unused]],
    const CollectiveRequest& request [[maybe_unused]], const AlgorithmSelectorConfig& config [[maybe_unused]]) {
  // TODO: Implement multi-node algorithm selection
  // Multi-node scenarios will need to consider:
  // 1. Multi-node NVLS (if supported by hardware)
  // 2. Multi-node IB (InfiniBand)
  // 3. Hierarchical algorithms (intra-node + inter-node)
  // 4. Network topology awareness
  // For now, return nullptr to fallback to NCCL/RCCL
  INFO(MSCCLPP_NCCL, "Multi-node collective not yet supported, fallback to nccl/rccl");
  return nullptr;
}
} // namespace nccl
} // namespace mscclpp

View File

@@ -0,0 +1,48 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#ifndef MSCCLPP_EXT_NCCL_ALGORITHM_SELECTOR_HPP_
#define MSCCLPP_EXT_NCCL_ALGORITHM_SELECTOR_HPP_
#include <memory>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/core.hpp>
#include <unordered_map>
namespace mscclpp {
namespace nccl {
/// Configuration for algorithm selection.
/// Snapshot of per-call environment facts the selector functions consult.
/// Keep the field order stable: callers populate this with designated
/// initializers, which require declaration order.
struct AlgorithmSelectorConfig {
  bool symmetricMemory;      // from the MSCCLPP ncclSymmetricMemory env setting
  bool nvlsSupported;        // device-level NVLS support (mscclpp::isNvlsSupported)
  bool isCuMemMapAllocated;  // both input and output buffers were cuMemMap-allocated
  bool inCaptureMode;        // the request's stream is actively capturing a CUDA graph
  std::pair<int, int> computeCapability;  // {major, minor} of the current device
  bool ncclDlopenSharedLib;  // a vendor NCCL/RCCL shared library is loaded as fallback
};
/// Select an algorithm for single-node allreduce
std::shared_ptr<Algorithm> selectSingleNodeAllreduce(
const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
const AlgorithmSelectorConfig& config);
/// Select an algorithm for single-node allgather
std::shared_ptr<Algorithm> selectSingleNodeAllgather(
const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
const AlgorithmSelectorConfig& config);
/// Select an algorithm for multi-node collective operations
/// Currently returns nullptr to fallback to NCCL/RCCL
/// TODO: Implement multi-node NVLS and multi-node IB algorithms
std::shared_ptr<Algorithm> selectMultiNodeAlgorithm(
const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
const AlgorithmSelectorConfig& config);
/// Check if an execution plan matches the request
bool matchExecutionPlan(std::shared_ptr<DslAlgorithm> algo, const CollectiveRequest& request);
} // namespace nccl
} // namespace mscclpp
#endif // MSCCLPP_EXT_NCCL_ALGORITHM_SELECTOR_HPP_

View File

@@ -9,6 +9,8 @@
#include <cstddef>
#include <mscclpp/gpu_data_types.hpp>
#include "logger.hpp"
// Convert ncclDataType_t to mscclpp::DataType
inline mscclpp::DataType ncclDataTypeToMscclpp(ncclDataType_t dtype) {
switch (dtype) {
@@ -70,8 +72,8 @@ static inline ncclDataType_t mscclppToNcclDataType(mscclpp::DataType dtype) {
return ncclFloat8e5m2;
#endif
default:
assert(false && "Unsupported mscclpp::DataType");
return ncclNumTypes;
THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
"Unsupported mscclpp::DataType: " + std::to_string(static_cast<int>(dtype)));
}
}

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <algorithm>
#include <filesystem>
@@ -19,8 +19,10 @@
#include <mscclpp/algorithm.hpp>
#include "algorithm_selector.hpp"
#include "datatype_conversion.hpp"
#include "debug.h"
static constexpr auto MSCCLPP_NCCL = mscclpp::LogSubsys::NCCL;
#define NCCL_API extern "C" __attribute__((visibility("default")))
@@ -80,17 +82,17 @@ static inline int mscclppNcclDlopenInit() {
const char* ncclLibPath = mscclpp::env()->ncclSharedLibPath.c_str();
if (ncclLibPath != nullptr && ncclLibPath[0] != '\0') {
if (std::filesystem::is_directory(ncclLibPath)) {
WARN("The value of the environment variable %s is a directory", ncclLibPath);
WARN(MSCCLPP_NCCL, "The value of the environment variable %s is a directory", ncclLibPath);
return dlopenError;
}
mscclppNcclDlHandle = dlopen(ncclLibPath, RTLD_LAZY | RTLD_NODELETE);
if (!mscclppNcclDlHandle) {
WARN("Cannot open the shared library specified by MSCCLPP_NCCL_LIB_PATH: %s\n", dlerror());
WARN(MSCCLPP_NCCL, "Cannot open the shared library specified by MSCCLPP_NCCL_LIB_PATH: %s\n", dlerror());
return dlopenError;
}
} else {
WARN("The value of MSCCLPP_NCCL_LIB_PATH is empty!\n");
WARN(MSCCLPP_NCCL, "The value of MSCCLPP_NCCL_LIB_PATH is empty!\n");
return dlopenError;
}
@@ -179,7 +181,7 @@ struct ncclComm {
NCCL_API ncclResult_t ncclGetVersion(int* version) {
if (version == nullptr) {
WARN("version is nullptr");
WARN(MSCCLPP_NCCL, "version is nullptr");
return ncclInvalidArgument;
}
*version = MSCCLPP_VERSION;
@@ -188,7 +190,7 @@ NCCL_API ncclResult_t ncclGetVersion(int* version) {
NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) {
if (uniqueId == nullptr) {
WARN("uniqueId is nullptr");
WARN(MSCCLPP_NCCL, "uniqueId is nullptr");
return ncclInvalidArgument;
}
if (mscclpp::UniqueIdBytes != NCCL_UNIQUE_ID_BYTES) return ncclInternalError;
@@ -212,21 +214,6 @@ static std::pair<int, int> getDeviceComputeCapability() {
return std::make_pair(major, minor);
}
static bool matchExecutionPlan(std::shared_ptr<mscclpp::DslAlgorithm> algo, const mscclpp::CollectiveRequest& request) {
bool worldSizeMatch = algo->constraint().worldSize == request.worldSize;
bool ranksPerNodeMatch = algo->constraint().nRanksPerNode == request.nRanksPerNode;
bool collectiveMatch = algo->collective() == request.collective;
bool bufferModeMatch =
algo->bufferMode() == mscclpp::CollectiveBufferMode::Any || request.bufferMode() == algo->bufferMode();
size_t effectiveSize =
(request.collective == "allgather") ? (request.messageSize * request.worldSize) : request.messageSize;
bool minSizeMatch = effectiveSize >= algo->messageRange().first;
bool maxSizeMatch = effectiveSize <= algo->messageRange().second;
bool result =
worldSizeMatch && ranksPerNodeMatch && collectiveMatch && bufferModeMatch && minSizeMatch && maxSizeMatch;
return result;
}
static std::shared_ptr<mscclpp::Algorithm> algoSelector(
const std::unordered_map<std::string, std::unordered_map<std::string, std::shared_ptr<mscclpp::Algorithm>>>&
algoMapByCollective,
@@ -234,86 +221,64 @@ static std::shared_ptr<mscclpp::Algorithm> algoSelector(
if (algoMapByCollective.find(request.collective) == algoMapByCollective.end()) {
return nullptr;
}
for (const auto& pair : algoMapByCollective.at(request.collective)) {
const auto& algo = pair.second;
if (algo->type() == mscclpp::AlgorithmType::DSL) {
if (matchExecutionPlan(std::static_pointer_cast<mscclpp::DslAlgorithm>(algo), request)) {
if (mscclpp::nccl::matchExecutionPlan(std::static_pointer_cast<mscclpp::DslAlgorithm>(algo), request)) {
return algo;
}
}
}
if (request.nRanksPerNode != request.worldSize) {
// Fallback to nccl/rccl when multi-node
return nullptr;
}
static const bool mscclppDisableChannelCache = mscclpp::env()->disableChannelCache;
// Prepare algorithm selector configuration
static const bool isNvlsSupported = mscclpp::isNvlsSupported();
static const std::pair<int, int> deviceComputeCapability = getDeviceComputeCapability();
size_t messageSize = request.messageSize;
const std::string& collective = request.collective;
bool isCuMemMapAllocated = mscclpp::isCuMemMapAllocated(const_cast<void*>(request.inputBuffer)) &&
mscclpp::isCuMemMapAllocated(request.outputBuffer);
bool useNvlsWithZeroCopy = isNvlsSupported && !mscclppDisableChannelCache && isCuMemMapAllocated;
if (collective == "allgather") {
if (messageSize <= 32 * (1 << 20)) {
return algoMapByCollective.at(collective).at("default_allgather_fullmesh2");
} else {
#if defined(__HIP_PLATFORM_AMD__)
return algoMapByCollective.at(collective).at("default_allgather_fullmesh2");
#else
if (!mscclppNcclDlopenSharedLib) {
return algoMapByCollective.at(collective).at("default_allgather_fullmesh");
}
#endif
}
static const bool ncclSymmetricMemory = mscclpp::env()->ncclSymmetricMemory;
const bool isCuMemMapAllocated = mscclpp::isCuMemMapAllocated(const_cast<void*>(request.inputBuffer)) &&
mscclpp::isCuMemMapAllocated(request.outputBuffer);
cudaStreamCaptureStatus captureStatus = cudaStreamCaptureStatusNone;
CUDACHECK(cudaStreamIsCapturing(request.stream, &captureStatus));
const bool inCaptureMode = (captureStatus == cudaStreamCaptureStatusActive);
mscclpp::nccl::AlgorithmSelectorConfig config{.symmetricMemory = ncclSymmetricMemory,
.nvlsSupported = isNvlsSupported,
.isCuMemMapAllocated = isCuMemMapAllocated,
.inCaptureMode = inCaptureMode,
.computeCapability = deviceComputeCapability,
.ncclDlopenSharedLib = mscclppNcclDlopenSharedLib};
const auto& algoMap = algoMapByCollective.at(request.collective);
// Check if this is a multi-node scenario
if (request.nRanksPerNode != request.worldSize) {
return mscclpp::nccl::selectMultiNodeAlgorithm(algoMap, request, config);
}
if (collective == "allreduce") {
bool useNvls = isNvlsSupported;
bool isFp8 = request.dtype == mscclpp::DataType::FP8_E4M3 || request.dtype == mscclpp::DataType::FP8_E5M2;
#if !defined(__HIP_PLATFORM_AMD__)
if (isFp8 && deviceComputeCapability.first < 10) {
// NVLS does not support FP8 on devices with compute capability < 10
useNvls = false;
}
#endif
if (messageSize <= (1 << 15) && useNvls) {
return algoMapByCollective.at(collective).at("default_allreduce_nvls_packet");
}
if (messageSize <= (1 << 14)) {
return algoMapByCollective.at(collective).at("default_allreduce_allpair_packet");
}
if (messageSize <= (1 << 16) || (messageSize <= (1 << 20) && !useNvlsWithZeroCopy)) {
return algoMapByCollective.at(collective).at("default_allreduce_packet");
}
if (useNvls && useNvlsWithZeroCopy) {
return algoMapByCollective.at(collective).at("default_allreduce_nvls");
}
if (useNvls && messageSize < (1 << 24)) {
return algoMapByCollective.at(collective).at("default_allreduce_nvls_with_copy");
}
if (useNvls && messageSize >= (1 << 24)) {
return algoMapByCollective.at(collective).at("default_allreduce_nvls_with_copy2");
}
#if defined(__HIP_PLATFORM_AMD__)
return algoMapByCollective.at(collective).at("default_allreduce_fullmesh");
#else
if (!mscclppNcclDlopenSharedLib) {
return algoMapByCollective.at(collective).at("default_allreduce_fullmesh");
}
#endif
// Single-node scenarios
if (request.collective == "allgather") {
return mscclpp::nccl::selectSingleNodeAllgather(algoMap, request, config);
}
INFO(MSCCLPP_NCCL, "Failed to get algo from customized kernel, fallback to nccl/rccl");
if (request.collective == "allreduce") {
return mscclpp::nccl::selectSingleNodeAllreduce(algoMap, request, config);
}
INFO(MSCCLPP_NCCL, "No suitable algorithm found for collective '%s', fallback to nccl/rccl",
request.collective.c_str());
return nullptr;
}
NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) {
INFO(MSCCLPP_NCCL, "Initializing NCCL communicator for rank %d, world_size=%d", rank, nranks);
if (comm == nullptr) {
WARN("comm is nullptr");
WARN(MSCCLPP_NCCL, "comm is nullptr");
return ncclInvalidArgument;
}
if (nranks < 0 || rank < 0 || rank >= nranks) {
WARN("nranks is %d, rank is %d", nranks, rank);
WARN(MSCCLPP_NCCL, "nranks is %d, rank is %d", nranks, rank);
return ncclInvalidArgument;
}
std::shared_ptr<mscclpp::TcpBootstrap> bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, nranks);
@@ -327,12 +292,15 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
commPtr->scratchBuffer_ = mscclpp::GpuBuffer<char>(commPtr->scratchBufferSize_).memory();
commPtr->executor = std::make_shared<mscclpp::Executor>(mscclppComm, commPtr->scratchBuffer_);
auto [flagBuffer, flagBufferSize] = mscclpp::getDefaultFlagBuffer();
commPtr->nRanksPerNode = mscclppComm->bootstrap()->getNranksPerNode();
commPtr->worldSize = mscclppComm->bootstrap()->getNranks();
auto algoBuilder = mscclpp::collective::AlgorithmCollectionBuilder::getInstance();
algoBuilder->setFallbackAlgorithmSelector(algoSelector);
commPtr->algorithmCollection = algoBuilder->buildDefaultAlgorithms(
reinterpret_cast<uintptr_t>(commPtr->scratchBuffer_.get()), commPtr->scratchBufferSize_, rank);
reinterpret_cast<uintptr_t>(commPtr->scratchBuffer_.get()), commPtr->scratchBufferSize_,
reinterpret_cast<uintptr_t>(flagBuffer.get()), flagBufferSize, rank);
// Extend with user-defined algorithms
commPtr->algorithmCollection.extend(algoBuilder->build());
@@ -346,7 +314,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
const std::string ncclLibPath = mscclpp::env()->ncclSharedLibPath;
if (!ncclLibPath.empty() && !mscclppNcclDlopenSharedLib) {
if (!tryLoadNcclSharedLib()) {
WARN("Failed to load the shared library for nccl/rccl");
WARN(MSCCLPP_NCCL, "Failed to load the shared library for nccl/rccl");
return ncclInternalError;
}
}
@@ -361,7 +329,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
commPtr->mscclppNcclComm = new ncclComm_t();
if (commPtr->mscclppNcclComm == nullptr) {
WARN("Failed to allocate memory for mscclppNcclComm");
WARN(MSCCLPP_NCCL, "Failed to allocate memory for mscclppNcclComm");
return ncclInternalError;
}
mscclppNcclOps.CommInitRank(reinterpret_cast<ncclComm_t*>(commPtr->mscclppNcclComm), nranks, mscclppNcclUniqueId,
@@ -378,7 +346,7 @@ NCCL_API ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int*) {
return ncclCommInitRank(comm, ndev, Id, 0);
}
// TODO: implement this function
WARN("ncclCommInitAll is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommInitAll is currently unavailable");
return ncclInternalError;
}
@@ -389,7 +357,7 @@ NCCL_API ncclResult_t ncclCommFinalize(ncclComm_t comm) {
NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == nullptr) {
WARN("comm is nullptr");
WARN(MSCCLPP_NCCL, "comm is nullptr");
return ncclInvalidArgument;
}
#if defined(ENABLE_NPKIT)
@@ -447,7 +415,7 @@ NCCL_API ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclCom
}
ncclResult_t ncclCommInitRankScalable(ncclComm_t*, int, int, int, ncclUniqueId*, ncclConfig_t*) {
WARN("ncclCommInitRankScalable is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommInitRankScalable is currently unavailable");
return ncclInternalError;
}
@@ -481,7 +449,7 @@ NCCL_API const char* ncclGetLastError(ncclComm_t) {
NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t, ncclResult_t* asyncError) {
if (asyncError == nullptr) {
WARN("asyncError is nullptr");
WARN(MSCCLPP_NCCL, "asyncError is nullptr");
return ncclInvalidArgument;
}
*asyncError = ncclSuccess;
@@ -490,7 +458,7 @@ NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t, ncclResult_t* asyncError
NCCL_API ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
if (comm == nullptr || count == nullptr) {
WARN("comm is nullptr or count is nullptr");
WARN(MSCCLPP_NCCL, "comm is nullptr or count is nullptr");
return ncclInvalidArgument;
}
*count = comm->comm->bootstrap()->getNranks();
@@ -499,7 +467,7 @@ NCCL_API ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
NCCL_API ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) {
if (comm == nullptr || device == nullptr) {
WARN("comm is nullptr or device is nullptr");
WARN(MSCCLPP_NCCL, "comm is nullptr or device is nullptr");
return ncclInvalidArgument;
}
*device = comm->comm->bootstrap()->getRank();
@@ -508,7 +476,7 @@ NCCL_API ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) {
NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
if (comm == nullptr || rank == nullptr) {
WARN("comm is nullptr or rank is nullptr");
WARN(MSCCLPP_NCCL, "comm is nullptr or rank is nullptr");
return ncclInvalidArgument;
}
@@ -521,24 +489,24 @@ NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
}
NCCL_API ncclResult_t ncclCommWindowRegister(ncclComm_t, void*, size_t, ncclWindow_t*, int) {
WARN("ncclCommWindowRegister is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommWindowRegister is currently unavailable");
return ncclInternalError;
}
NCCL_API ncclResult_t ncclCommWindowDeregister(ncclComm_t, ncclWindow_t) {
WARN("ncclCommWindowDeregister is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommWindowDeregister is currently unavailable");
return ncclInternalError;
}
NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t*, void*, ncclDataType_t, ncclScalarResidence_t, ncclComm_t) {
// TODO: implement this function
WARN("ncclRedOpCreatePreMulSum is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclRedOpCreatePreMulSum is currently unavailable");
return ncclInternalError;
}
NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t, ncclComm_t) {
// TODO: implement this function
WARN("ncclRedOpDestroy is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclRedOpDestroy is currently unavailable");
return ncclInternalError;
}
@@ -549,7 +517,7 @@ NCCL_API ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t co
return mscclppNcclOps.Reduce(sendbuff, recvbuff, count, datatype, op, root,
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
WARN("ncclReduce is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclReduce is currently unavailable");
return ncclInternalError;
}
@@ -569,9 +537,9 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
}
int rank = comm->comm->bootstrap()->getRank();
if ((sendbuff == nullptr && root == rank) || recvbuff == nullptr || bytes == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
WARN(MSCCLPP_NCCL,
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
return ncclInvalidArgument;
}
@@ -587,12 +555,15 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
mscclpp::DataType dtype = ncclDataTypeToMscclpp(datatype);
static std::unordered_map<std::string, std::vector<uint64_t>> hints{{"root", {static_cast<uint64_t>(root)}}};
hints["root"][0] = static_cast<uint64_t>(root);
const bool symmetricMemory = mscclpp::env()->ncclSymmetricMemory;
mscclpp::CollectiveRequest request = {.worldSize = comm->worldSize,
.nRanksPerNode = comm->nRanksPerNode,
.rank = rank,
.inputBuffer = sendbuff,
.outputBuffer = recvbuff,
.messageSize = bytes,
.stream = stream,
.collective = "broadcast",
.dtype = dtype,
.hints = hints};
@@ -600,7 +571,8 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
if (algo != nullptr) {
std::unordered_map<std::string, uintptr_t> extras{{"root", reinterpret_cast<uintptr_t>(&root)}};
return static_cast<ncclResult_t>(algo->execute(comm->comm, sendbuff, recvbuff, bytes, bytes, dtype,
mscclpp::ReduceOp::NOP, stream, comm->executor, 0, 0, extras));
mscclpp::ReduceOp::NOP, stream, comm->executor, 0, 0,
symmetricMemory, extras));
}
if (mscclppNcclDlopenSharedLib == true) {
@@ -608,7 +580,7 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
WARN("No FallBack implementation for broadcast");
WARN(MSCCLPP_NCCL, "No FallBack implementation for broadcast");
return ncclInvalidUsage;
}
@@ -623,9 +595,9 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
}
// Checking if the parameters are valids
if (sendbuff == nullptr || recvbuff == nullptr || count == 0 || ncclTypeSize(datatype) == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, count is 0, "
"datatype is invalid, or comm is nullptr.");
WARN(MSCCLPP_NCCL,
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, count is 0, "
"datatype is invalid, or comm is nullptr.");
return ncclInvalidArgument;
}
// Declarating variables
@@ -639,12 +611,14 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
mscclpp::DataType dtype = ncclDataTypeToMscclpp(datatype);
const bool symmetricMemory = mscclpp::env()->ncclSymmetricMemory;
mscclpp::CollectiveRequest request = {.worldSize = comm->worldSize,
.nRanksPerNode = comm->nRanksPerNode,
.rank = rank,
.inputBuffer = sendbuff,
.outputBuffer = recvbuff,
.messageSize = bytes,
.stream = stream,
.collective = "allreduce",
.dtype = dtype,
.hints = {}};
@@ -652,7 +626,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
auto algo = comm->algorithmCollection.selectAlgorithm(request);
if (algo != nullptr) {
return static_cast<ncclResult_t>(algo->execute(comm->comm, sendbuff, recvbuff, bytes, bytes, dtype,
ncclRedOpToMscclpp(reductionOperation), stream, comm->executor));
ncclRedOpToMscclpp(reductionOperation), stream, comm->executor, 0, 0,
symmetricMemory));
}
if (mscclppNcclDlopenSharedLib == true) {
@@ -660,7 +635,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
WARN("No FallBack implementation for AllReduce");
WARN(MSCCLPP_NCCL, "No FallBack implementation for AllReduce");
return ncclInvalidUsage;
}
@@ -675,9 +650,9 @@ NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, si
}
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
WARN(MSCCLPP_NCCL,
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
return ncclInvalidArgument;
}
@@ -693,19 +668,22 @@ NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, si
int rank = comm->comm->bootstrap()->getRank();
int nRank = comm->comm->bootstrap()->getNranks();
mscclpp::DataType dtype = ncclDataTypeToMscclpp(datatype);
const bool symmetricMemory = mscclpp::env()->ncclSymmetricMemory;
mscclpp::CollectiveRequest request = {.worldSize = comm->worldSize,
.nRanksPerNode = comm->nRanksPerNode,
.rank = rank,
.inputBuffer = sendbuff,
.outputBuffer = recvbuff,
.messageSize = bytes * nRank,
.stream = stream,
.collective = "reducescatter",
.dtype = dtype,
.hints = {}};
auto algo = comm->algorithmCollection.selectAlgorithm(request);
if (algo != nullptr) {
return static_cast<ncclResult_t>(algo->execute(comm->comm, sendbuff, recvbuff, bytes * nRank, bytes, dtype,
ncclRedOpToMscclpp(op), stream, comm->executor));
ncclRedOpToMscclpp(op), stream, comm->executor, 0, 0,
symmetricMemory));
}
if (mscclppNcclDlopenSharedLib == true) {
@@ -713,7 +691,7 @@ NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, si
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
WARN("No FallBack implementation for ReduceScatter");
WARN(MSCCLPP_NCCL, "No FallBack implementation for ReduceScatter");
return ncclInternalError;
}
@@ -727,9 +705,9 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
return ncclSuccess;
}
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
WARN(MSCCLPP_NCCL,
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
return ncclInvalidArgument;
}
@@ -745,12 +723,14 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
}
mscclpp::DataType dtype = ncclDataTypeToMscclpp(datatype);
const bool symmetricMemory = mscclpp::env()->ncclSymmetricMemory;
mscclpp::CollectiveRequest request = {.worldSize = comm->worldSize,
.nRanksPerNode = comm->nRanksPerNode,
.rank = rank,
.inputBuffer = sendbuff,
.outputBuffer = recvbuff,
.messageSize = bytes,
.stream = stream,
.collective = "allgather",
.dtype = dtype,
.hints = {}};
@@ -758,7 +738,8 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
auto algo = comm->algorithmCollection.selectAlgorithm(request);
if (algo != nullptr) {
return static_cast<ncclResult_t>(algo->execute(comm->comm, sendbuff, recvbuff, bytes, bytes * nRank, dtype,
mscclpp::ReduceOp::NOP, stream, comm->executor));
mscclpp::ReduceOp::NOP, stream, comm->executor, 0, 0,
symmetricMemory));
}
if (mscclppNcclDlopenSharedLib == true) {
@@ -766,7 +747,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
WARN("No FallBack implementation for AllGather");
WARN(MSCCLPP_NCCL, "No FallBack implementation for AllGather");
return ncclInvalidUsage;
}
@@ -776,7 +757,7 @@ NCCL_API ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_
return mscclppNcclOps.Send(sendbuff, count, datatype, peer, *reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm),
stream);
}
WARN("ncclSend is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclSend is currently unavailable");
return ncclInternalError;
}
@@ -786,7 +767,7 @@ NCCL_API ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t data
return mscclppNcclOps.Recv(recvbuff, count, datatype, peer, *reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm),
stream);
}
WARN("ncclRecv is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclRecv is currently unavailable");
return ncclInternalError;
}
@@ -800,7 +781,7 @@ NCCL_API ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t
return ncclSuccess;
}
// TODO: implement this function
WARN("ncclAllToAll is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclAllToAll is currently unavailable");
return ncclInternalError;
}
@@ -815,7 +796,7 @@ NCCL_API ncclResult_t ncclAllToAllv(const void* sendbuff, [[maybe_unused]] const
cudaMemcpyDeviceToDevice, stream));
return ncclSuccess;
}
WARN("ncclAllToAllv is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclAllToAllv is currently unavailable");
return ncclInternalError;
}
@@ -824,7 +805,7 @@ NCCL_API ncclResult_t ncclGroupStart() {
if (mscclppNcclDlopenSharedLib == true) {
return mscclppNcclOps.GroupStart();
}
WARN("ncclGroupStart is currently unavailable, return success");
WARN(MSCCLPP_NCCL, "ncclGroupStart is currently unavailable, return success");
return ncclSuccess;
}
@@ -832,56 +813,56 @@ NCCL_API ncclResult_t ncclGroupEnd() {
if (mscclppNcclDlopenSharedLib == true) {
return mscclppNcclOps.GroupEnd();
}
WARN("ncclGroupEnd is currently unavailable, return success");
WARN(MSCCLPP_NCCL, "ncclGroupEnd is currently unavailable, return success");
return ncclSuccess;
}
NCCL_API ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t*) {
// TODO: implement this function
WARN("ncclGroupSimulateEnd is not implemented");
WARN(MSCCLPP_NCCL, "ncclGroupSimulateEnd is not implemented");
return ncclInternalError;
}
NCCL_API ncclResult_t ncclCommRegister(const ncclComm_t, void*, size_t, void**) {
// TODO: Implementation
WARN("ncclCommRegister is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommRegister is currently unavailable");
return ncclInternalError;
}
NCCL_API ncclResult_t ncclCommDeregister(const ncclComm_t, void*) {
// TODO: Implementation
WARN("ncclCommDeregister is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommDeregister is currently unavailable");
return ncclInternalError;
}
ncclResult_t ncclMemAlloc(void** ptr, size_t size) {
if (ptr == nullptr || size == 0) {
WARN("ptr is nullptr or size is 0");
WARN(MSCCLPP_NCCL, "ptr is nullptr or size is 0");
return ncclInvalidArgument;
}
std::shared_ptr<char> sharedPtr;
try {
sharedPtr = mscclpp::GpuBuffer(size).memory();
if (sharedPtr == nullptr) {
WARN("Failed to allocate memory via ncclMemAlloc");
WARN(MSCCLPP_NCCL, "Failed to allocate memory via ncclMemAlloc");
return ncclSystemError;
}
} catch (const mscclpp::Error& e) {
if (e.getErrorCode() == mscclpp::ErrorCode::InvalidUsage) {
WARN("Invalid usage: %s", e.what());
WARN(MSCCLPP_NCCL, "Invalid usage: %s", e.what());
return ncclInvalidUsage;
} else {
WARN("Internal error: %s", e.what());
WARN(MSCCLPP_NCCL, "Internal error: %s", e.what());
return ncclInternalError;
}
} catch (const mscclpp::CudaError& e) {
WARN("Cuda error: %s", e.what());
WARN(MSCCLPP_NCCL, "Cuda error: %s", e.what());
return ncclUnhandledCudaError;
} catch (const mscclpp::CuError& e) {
WARN("Cu error: %s", e.what());
WARN(MSCCLPP_NCCL, "Cu error: %s", e.what());
return ncclUnhandledCudaError;
} catch (const mscclpp::BaseError& e) {
WARN("Base error: %s", e.what());
WARN(MSCCLPP_NCCL, "Base error: %s", e.what());
return ncclInternalError;
}
ptrMap[sharedPtr.get()] = sharedPtr;
@@ -899,6 +880,6 @@ ncclResult_t ncclMemFree(void* ptr) {
}
// Pointer not found
WARN("Pointer not found");
WARN(MSCCLPP_NCCL, "Pointer not found");
return ncclInvalidUsage;
}

View File

@@ -1,8 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
# run with:
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so MSCCLPP_DISABLE_CHANNEL_CACHE=true torchrun --nproc_per_node=8 ./allreduce_temp_buff.py
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so MSCCLPP_NCCL_SYMMETRIC_MEMORY=false torchrun --nproc_per_node=8 ./allreduce_temp_buff.py
import os
import torch

View File

@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so MSCCLPP_DISABLE_CHANNEL_CACHE=true torchrun --nnodes=1 --nproc_per_node=8 memory_report.py
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so MSCCLPP_NCCL_SYMMETRIC_MEMORY=false torchrun --nnodes=1 --nproc_per_node=8 memory_report.py
import os, sys
import torch
import torch.distributed as dist