Refactor algo selection logic and introduce symmetric_memory env (#741)

This PR refactors the algorithm selection logic in MSCCL++ and introduces support for symmetric memory configuration through environment variables. 1. Algorithm Selection Refactoring: Use a separate class for algorithm selection. This makes it possible to introduce more complex selection logic based on message size, architecture, whether CUDA graph is enabled, and the memory allocation method. 2. Symmetric Memory Support: Introduce a symmetricMemory parameter in algorithm context key generation. Remove the disableChannelCache env because it is ambiguous. 3. New args for build_default_algorithms: Add flag_buffer and flag_buffer_size args to build_default_algorithms. This lets different algorithms share a unified flag buffer and avoids the application hanging when switching algorithms for different message sizes. --------- Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com> Co-authored-by: Qinghua Zhou <qinghuazhou@microsoft.com> Co-authored-by: Caio Rocha <caiorocha@microsoft.com>
2026-05-24 14:54:51 +00:00 · 2026-02-12 19:06:18 -08:00
parent dff3bc7bbb
commit bd68319e3e
43 changed files with 657 additions and 389 deletions
--- a/src/core/algorithm.cc
+++ b/src/core/algorithm.cc
@@ -3,6 +3,7 @@

 #include <filesystem>
 #include <mscclpp/algorithm.hpp>
+#include <mscclpp/gpu_utils.hpp>

 #include "logger.hpp"

@@ -40,12 +41,12 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF
 CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
                                    size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
                                    cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
-                                    const std::unordered_map<std::string, uintptr_t>& extras) {
+                                    bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras) {
  if (!initialized_) {
    initFunc_(comm);
    initialized_ = true;
  }
-  AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype);
+  AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype, symmetricMemory);
  auto it = contexts_.find(ctxKey);
  if (it == contexts_.end()) {
    auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype);
@@ -155,7 +156,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }

 CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
                                 size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
-                                 std::shared_ptr<Executor> executor, int, int,
+                                 std::shared_ptr<Executor> executor, int, int, bool,
                                 const std::unordered_map<std::string, uintptr_t>&) {
  if (!executor) {
    THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
@@ -198,4 +199,18 @@ std::shared_ptr<Algorithm> DslAlgorithm::build() { return shared_from_this(); }
 // TODO: implement this
 void DslAlgorithm::reset() {}

+static std::weak_ptr<uint32_t> gDefaultFlagBuffer;  // Non-owning handle to the default flag buffer; assigned by getDefaultFlagBuffer().
+static size_t gDefaultFlagCount = 128;  // Number of uint32_t flags in the default buffer (an element count, not a byte size).
+
+std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer() {  // Returns {default flag buffer, its size in bytes}, creating the buffer if no live one exists.
+  std::shared_ptr<uint32_t> flagBuffer = gDefaultFlagBuffer.lock();  // Null if no buffer was created yet or every owner has released it.
+  if (!flagBuffer) {  // NOTE(review): no synchronization is visible around this global; confirm callers do not race on first use.
+    flagBuffer = mscclpp::detail::gpuCallocShared<uint32_t>(gDefaultFlagCount);  // NOTE(review): presumably device memory owned by the returned shared_ptr; confirm in gpu_utils.hpp.
+    std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);  // Host-side staging values: every flag starts at 1.
+    mscclpp::gpuMemcpy(flagBuffer.get(), initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);  // NOTE(review): third argument is an element count, not bytes; assumes gpuMemcpy scales by sizeof(T), confirm.
+    gDefaultFlagBuffer = flagBuffer;  // Remember the buffer weakly; the global itself does not keep it alive.
+  }
+  return {flagBuffer, gDefaultFlagCount * sizeof(uint32_t)};  // Second element is the buffer size in bytes.
+}
+
 }  // namespace mscclpp