Refactor algo selection logic and introduce symmetric_memory env (#741)

This PR refactors the algorithm selection logic in MSCCL++ and
introduces support for symmetric memory configuration through
environment variables.


1. Algorithm Selection Refactoring
Introduces a separate class for algorithm selection. This makes it possible to
add more complex selection logic based on message size, architecture, whether a
CUDA graph is enabled, and the memory allocation method.

2. Symmetric Memory Support
Introduced a symmetricMemory parameter in algorithm context key
generation. Removed the disableChannelCache environment variable, as its
meaning was ambiguous.

3. Add new args for build_default_algorithms 
Added flag_buffer and flag_buffer_size arguments to build_default_algorithms.
This lets different algorithms share a unified flag buffer, avoiding
application hangs when switching algorithms for different message sizes.

---------

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
Co-authored-by: Qinghua Zhou <qinghuazhou@microsoft.com>
Co-authored-by: Caio Rocha <caiorocha@microsoft.com>
This commit is contained in:
Binyang Li
2026-02-12 19:06:18 -08:00
committed by GitHub
parent dff3bc7bbb
commit bd68319e3e
43 changed files with 657 additions and 389 deletions

View File

@@ -343,8 +343,8 @@ public:
},
// Context key generation function
[self](const void* input, void* output,
size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateContextKey(input, output, inputSize, outputSize, dtype);
size_t inputSize, size_t outputSize, mscclpp::DataType dtype, bool symmetricMemory) {
return self->generateContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory);
}
);
}

View File

@@ -107,9 +107,10 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype,
bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize,
static_cast<ncclDataType_t>(dtype));
static_cast<ncclDataType_t>(dtype), symmetricMemory);
});
return allgatherAlgo;
}
@@ -191,7 +192,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
}
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
size_t outputSize, ncclDataType_t dtype) {
size_t outputSize, ncclDataType_t dtype, bool) {
return {(void*)input, output, inputSize, outputSize, 0};
}
};

View File

@@ -75,8 +75,9 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype);
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype,
bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory);
});
return allgatherAlgo;
}
@@ -159,7 +160,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
}
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
size_t outputSize, mscclpp::DataType dtype) {
size_t outputSize, mscclpp::DataType dtype, bool) {
return {(void*)input, output, inputSize, outputSize, 0};
}
};

View File

@@ -15,7 +15,9 @@ import ipaddress
def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection:
collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
return collection_builder.build_default_algorithms(
scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank
scratch_buffer=scratch_buffer.data_ptr(),
scratch_buffer_size=scratch_buffer.nbytes,
rank=rank,
)

View File

@@ -96,11 +96,13 @@ class Algorithm {
/// @param executor The executor for DSL algorithms (may be nullptr for native).
/// @param nBlocks Number of CUDA blocks (0 for auto-selection).
/// @param nThreadsPerBlock Number of threads per block (0 for auto-selection).
/// @param symmetricMemory Whether to use symmetric memory optimization.
/// @param extras Additional parameters for algorithm-specific customization.
/// @return The result of the operation.
virtual CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {}) = 0;
/// Reset the algorithm state, clearing any cached contexts.
@@ -201,9 +203,10 @@ class NativeAlgorithm : public Algorithm {
/// @param inputSize Size of the input buffer.
/// @param outputSize Size of the output buffer.
/// @param dtype Data type of the elements.
/// @param symmetricMemory Whether symmetric memory is enabled.
/// @return A key uniquely identifying this buffer configuration.
using ContextKeyGenFunc = std::function<AlgorithmCtxKey(const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype)>;
size_t outputSize, DataType dtype, bool symmetricMemory)>;
/// Construct a NativeAlgorithm.
/// @param name Human-readable name of the algorithm.
@@ -225,6 +228,7 @@ class NativeAlgorithm : public Algorithm {
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
const std::string& name() const override;
const std::string& collective() const override;
@@ -274,6 +278,7 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
AlgorithmType type() const override { return AlgorithmType::DSL; }
Constraint constraint() const override;
@@ -299,6 +304,7 @@ struct CollectiveRequest {
const void* inputBuffer;
void* outputBuffer;
size_t messageSize;
cudaStream_t stream;
const std::string& collective;
const DataType dtype;
const std::unordered_map<std::string, std::vector<uint64_t>>& hints;
@@ -358,6 +364,10 @@ class AlgorithmCollection {
AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
};
/// Get a default GPU flag buffer (allocated once and reused).
/// @return A pair of (shared_ptr to the flag buffer, size in bytes).
std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer();
} // namespace mscclpp
#endif // MSCCLPP_ALGORITHM_HPP_

View File

@@ -98,12 +98,13 @@ class Env {
/// debugging purposes. Currently supports `all`, `broadcast`, `allreduce`, `reducescatter`, and `allgather`.
const std::string forceNcclFallbackOperation;
/// Env name: `MSCCLPP_DISABLE_CHANNEL_CACHE`. If set to true, it will disable the channel cache for NCCL APIs.
/// Currently, this should be set to true if the application may call NCCL APIs on the same local buffer with
/// different remote buffers, e.g., in the case of a dynamic communicator. If CUDA/HIP graphs are used, disabling
/// the channel cache won't affect the performance, but otherwise it may lead to performance degradation.
/// Env name: `MSCCLPP_NCCL_SYMMETRIC_MEMORY`. If set to true, it indicates that the application uses symmetric memory
/// allocation across all ranks, making it safe to cache memory handles for all NCCL algorithms. If set to false, the
/// system will either use non-zero-copy algorithms (when CUDA/HIP graphs are not enabled) or set up new connections
/// every time (when CUDA/HIP graphs are enabled). This should be set to false if the application may call NCCL APIs
/// on the same local buffer with different remote buffers, e.g., in the case of a dynamic communicator.
/// Default is false.
const bool disableChannelCache;
const bool ncclSymmetricMemory;
/// Env name: `MSCCLPP_FORCE_DISABLE_NVLS`. If set to true, it will disable the NVLS support in MSCCL++.
/// Default is false.

View File

@@ -47,7 +47,8 @@ class AlgorithmCollectionBuilder {
/// @return The built AlgorithmCollection containing all registered algorithms.
AlgorithmCollection build();
AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, int rank);
AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer,
size_t flagBufferSize, int rank);
private:
AlgorithmCollectionBuilder() = default;
@@ -55,7 +56,8 @@ class AlgorithmCollectionBuilder {
AlgoSelectFunc algoSelector_ = nullptr;
AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize);
AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize,
uintptr_t flagBuffer, size_t flagBufferSize);
AlgorithmCollection buildDefaultDslAlgorithms(int rank);
static std::shared_ptr<AlgorithmCollectionBuilder> gAlgorithmCollectionBuilder_;

View File

@@ -15,6 +15,7 @@ using cudaGraphExec_t = hipGraphExec_t;
using cudaDeviceProp = hipDeviceProp_t;
using cudaStream_t = hipStream_t;
using cudaStreamCaptureMode = hipStreamCaptureMode;
using cudaStreamCaptureStatus = hipStreamCaptureStatus;
using cudaMemcpyKind = hipMemcpyKind;
using cudaIpcMemHandle_t = hipIpcMemHandle_t;
@@ -35,6 +36,9 @@ constexpr auto cudaErrorNotSupported = hipErrorNotSupported;
constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal;
constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed;
constexpr auto cudaStreamCaptureStatusNone = hipStreamCaptureStatusNone;
constexpr auto cudaStreamCaptureStatusActive = hipStreamCaptureStatusActive;
constexpr auto cudaStreamCaptureStatusInvalidated = hipStreamCaptureStatusInvalidated;
constexpr auto cudaHostAllocMapped = hipHostMallocMapped;
constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined;
constexpr auto cudaMemcpyDefault = hipMemcpyDefault;
@@ -98,6 +102,7 @@ constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVIC
#define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__)
#define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__)
#define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__)
#define cudaStreamIsCapturing(...) hipStreamIsCapturing(__VA_ARGS__)
#define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__)
#define cudaGraphInstantiate(...) hipGraphInstantiate(__VA_ARGS__)
#define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__)

View File

@@ -68,16 +68,17 @@ void register_algorithm(nb::module_& m) {
"execute",
[](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock,
std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory,
std::unordered_map<std::string, uintptr_t> extras) {
return self.execute(comm, reinterpret_cast<const void*>(input), reinterpret_cast<void*>(output),
inputSize, outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
nBlocks, nThreadsPerBlock, extras);
nBlocks, nThreadsPerBlock, symmetricMemory, extras);
},
nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0,
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>());
nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false,
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>())
.def("reset", &Algorithm::reset);
nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")
.def(nb::init<>())
@@ -108,8 +109,17 @@ void register_algorithm(nb::module_& m) {
.def_prop_ro("output_buffer",
[](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.outputBuffer); })
.def_ro("message_size", &CollectiveRequest::messageSize)
.def_prop_ro("stream", [](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.stream); })
.def_prop_ro("collective", [](const CollectiveRequest& self) { return self.collective; })
.def_ro("dtype", &CollectiveRequest::dtype)
.def_prop_ro("hints", [](const CollectiveRequest& self) { return self.hints; })
.def("buffer_mode", &CollectiveRequest::bufferMode);
m.def(
"cpp_get_default_flag_buffer",
[]() {
auto [buffer, size] = getDefaultFlagBuffer();
return std::make_pair(reinterpret_cast<uintptr_t>(buffer.get()), size);
},
"Get the default flag buffer. Returns a tuple of (buffer_ptr, buffer_size).");
}

View File

@@ -44,7 +44,9 @@ void register_core(nb::module_& m) {
.value("uint32", DataType::UINT32)
.value("float16", DataType::FLOAT16)
.value("float32", DataType::FLOAT32)
.value("bfloat16", DataType::BFLOAT16);
.value("bfloat16", DataType::BFLOAT16)
.value("float8_e4m3", DataType::FP8_E4M3)
.value("float8_e5m2", DataType::FP8_E5M2);
nb::class_<Bootstrap>(m, "CppBootstrap")
.def("get_rank", &Bootstrap::getRank)

View File

@@ -29,6 +29,6 @@ void register_algorithm_collection_builder(nb::module_& m) {
nb::arg("selector"))
.def("build", &AlgorithmCollectionBuilder::build)
.def("build_default_algorithms", &AlgorithmCollectionBuilder::buildDefaultAlgorithms, nb::arg("scratch_buffer"),
nb::arg("scratch_buffer_size"), nb::arg("rank"))
nb::arg("scratch_buffer_size"), nb::arg("flag_buffer"), nb::arg("flag_buffer_size"), nb::arg("rank"))
.def_static("reset", &AlgorithmCollectionBuilder::reset);
}

View File

@@ -4,6 +4,7 @@
from __future__ import annotations
from typing import Optional, Tuple, Dict
from functools import cached_property
import cupy as cp
from mscclpp._mscclpp import (
@@ -18,6 +19,7 @@ from mscclpp._mscclpp import (
CppReduceOp,
CppAlgorithmBuilder,
CppAlgorithmCollection,
cpp_get_default_flag_buffer,
)
__all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"]
@@ -160,6 +162,7 @@ class Algorithm:
executor: Optional[CppExecutor] = None,
nblocks=0,
nthreads_per_block=0,
symmetric_memory: bool = False,
extras: Optional[Dict[str, int]] = None,
) -> int:
"""Execute the collective algorithm.
@@ -176,6 +179,7 @@ class Algorithm:
executor: The executor for DSL algorithms (required for DSL, optional for native).
nblocks: Number of CUDA blocks (0 for auto-selection).
nthreads_per_block: Number of threads per block (0 for auto-selection).
symmetric_memory: Whether to use symmetric memory optimization (default: False).
extras: Additional algorithm-specific parameters.
Returns:
@@ -193,9 +197,14 @@ class Algorithm:
executor,
nblocks,
nthreads_per_block,
symmetric_memory,
extras if extras is not None else {},
)
def reset(self):
"""Reset the internal state of the algorithm, if applicable."""
self._algorithm.reset()
class AlgorithmBuilder:
def __init__(self, algorithm_builder: CppAlgorithmBuilder):
@@ -230,3 +239,17 @@ class AlgorithmCollection:
"""Register an algorithm for a collective operation."""
self._native_collection.register_algorithm(collective, algo_name, algorithm._algorithm)
self._algorithms.append(algorithm)
def get_default_flag_buffer() -> cp.ndarray:
    """Get the default flag buffer shared by the default algorithms.

    The buffer is allocated once on the C++ side (via
    ``cpp_get_default_flag_buffer``) and wrapped here as a *non-owning* CuPy
    array: the underlying GPU memory is owned by the C++ runtime, so the
    returned array must not outlive it.

    Returns:
        A CuPy ``uint32`` array viewing the shared flag buffer on the GPU.
    """
    buffer_ptr, buffer_size = cpp_get_default_flag_buffer()
    # Wrap the raw device pointer without taking ownership (owner=None).
    memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, None), 0)
    # buffer_size is in bytes; 4 bytes per uint32 flag element.
    return cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr)

View File

@@ -3,7 +3,7 @@
from __future__ import annotations
from typing import Union
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_default_flag_buffer
import atexit
from mscclpp._mscclpp import CppAlgorithmCollectionBuilder
@@ -29,6 +29,7 @@ class AlgorithmCollectionBuilder:
if not hasattr(self, "_initialized"):
self._builder = CppAlgorithmCollectionBuilder.get_instance()
self._initialized = True
self._flag_buffer = None
def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]):
if isinstance(algorithm_builder, AlgorithmBuilder):
@@ -50,8 +51,17 @@ class AlgorithmCollectionBuilder:
collection = self._builder.build()
return AlgorithmCollection(collection)
def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection:
native_collection = self._builder.build_default_algorithms(int(scratch_buffer), scratch_buffer_size, rank)
def build_default_algorithms(
    self,
    scratch_buffer: int,
    scratch_buffer_size: int,
    rank: int,
) -> AlgorithmCollection:
    """Build the default set of collective algorithms.

    Lazily allocates the shared flag buffer on first call, caches it on the
    builder instance, and forwards its device pointer and size to the native
    builder so all default algorithms operate on one unified flag buffer.

    Args:
        scratch_buffer: Device pointer (as an int) to the scratch buffer.
        scratch_buffer_size: Size of the scratch buffer in bytes.
        rank: Rank of the current process.

    Returns:
        An AlgorithmCollection wrapping the native collection.
    """
    # Allocate once and keep a reference so the GPU flag buffer stays alive
    # for the lifetime of this builder and is reused across rebuilds.
    if self._flag_buffer is None:
        self._flag_buffer = get_default_flag_buffer()
    native_collection = self._builder.build_default_algorithms(
        int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank
    )
    return AlgorithmCollection(native_collection)

View File

@@ -192,5 +192,11 @@ def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType:
return DataType.int32
elif dtype == torch.bfloat16:
return DataType.bfloat16
# Hardware supports either the OCP format or the FNUZ format for float8.
# Both are mapped to the same MSCCL++ data type.
elif dtype == torch.float8_e5m2 or dtype == torch.float8_e5m2fnuz:
return DataType.float8_e5m2
elif dtype == torch.float8_e4m3fn or dtype == torch.float8_e4m3fnuz:
return DataType.float8_e4m3
else:
raise ValueError(f"Unknown data type: {dtype}")

View File

@@ -3,6 +3,7 @@
#include <filesystem>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "logger.hpp"
@@ -40,12 +41,12 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF
CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) {
bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras) {
if (!initialized_) {
initFunc_(comm);
initialized_ = true;
}
AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype);
AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype, symmetricMemory);
auto it = contexts_.find(ctxKey);
if (it == contexts_.end()) {
auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype);
@@ -155,7 +156,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }
CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
std::shared_ptr<Executor> executor, int, int,
std::shared_ptr<Executor> executor, int, int, bool,
const std::unordered_map<std::string, uintptr_t>&) {
if (!executor) {
THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
@@ -198,4 +199,18 @@ std::shared_ptr<Algorithm> DslAlgorithm::build() { return shared_from_this(); }
// TODO: implement this
void DslAlgorithm::reset() {}
// Process-wide cache of the default flag buffer. A weak_ptr is held so the GPU
// allocation is released once all callers drop their shared_ptr, and a later
// call transparently re-allocates it.
static std::weak_ptr<uint32_t> gDefaultFlagBuffer;
// Number of uint32_t flag slots in the default buffer (kernels index the
// buffer by blockIdx.x, so this bounds the usable block count).
static size_t gDefaultFlagCount = 128;

/// Get the default GPU flag buffer (allocated lazily, reused while any caller
/// still holds a reference). All flag slots are initialized to 1 on first
/// allocation — presumably so kernels deriving state from the flag value
/// (e.g. flag % numScratchBuff) start consistently; confirm against kernels.
/// NOTE(review): access to gDefaultFlagBuffer is not synchronized — assumes
/// this is only called from a single thread; confirm that holds.
/// @return A pair of (shared_ptr owning the device buffer, size in bytes).
std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer() {
  std::shared_ptr<uint32_t> flagBuffer = gDefaultFlagBuffer.lock();
  if (!flagBuffer) {
    flagBuffer = mscclpp::detail::gpuCallocShared<uint32_t>(gDefaultFlagCount);
    std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);
    // gpuMemcpy count is in elements here (deduced uint32_t), not bytes.
    mscclpp::gpuMemcpy(flagBuffer.get(), initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);
    gDefaultFlagBuffer = flagBuffer;
  }
  return {flagBuffer, gDefaultFlagCount * sizeof(uint32_t)};
}
} // namespace mscclpp

View File

@@ -64,7 +64,7 @@ Env::Env()
cudaIpcUseDefaultStream(readEnv<bool>("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)),
ncclSharedLibPath(readEnv<std::string>("MSCCLPP_NCCL_LIB_PATH", "")),
forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
disableChannelCache(readEnv<bool>("MSCCLPP_DISABLE_CHANNEL_CACHE", false)),
ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)) {}
std::shared_ptr<Env> env() {
@@ -91,7 +91,7 @@ std::shared_ptr<Env> env() {
logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream);
logEnv("MSCCLPP_NCCL_LIB_PATH", globalEnv->ncclSharedLibPath);
logEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", globalEnv->forceNcclFallbackOperation);
logEnv("MSCCLPP_DISABLE_CHANNEL_CACHE", globalEnv->disableChannelCache);
logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
}
return globalEnv;

View File

@@ -49,8 +49,9 @@ AlgorithmCollection AlgorithmCollectionBuilder::build() {
void AlgorithmCollectionBuilder::reset() { gAlgorithmCollectionBuilder_.reset(); }
AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t scratchBuffer,
size_t scratchBufferSize, int rank) {
auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize);
size_t scratchBufferSize, uintptr_t flagBuffer,
size_t flagBufferSize, int rank) {
auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize);
auto dslCollection = buildDefaultDslAlgorithms(rank);
nativeCollection.extend(dslCollection);
nativeCollection.setSelectors(algoSelector_, fallbackAlgoSelector_);
@@ -58,11 +59,15 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t
}
AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uintptr_t scratchBuffer,
size_t scratchBufferSize) {
size_t scratchBufferSize,
uintptr_t flagBuffer,
size_t flagBufferSize) {
AlgorithmCollection collection;
auto allreduceAllpairPkt = std::make_shared<AllreduceAllpairPacket>(scratchBuffer, scratchBufferSize)->build();
auto allreduceAllpairPkt =
std::make_shared<AllreduceAllpairPacket>(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build();
collection.registerAlgorithm(allreduceAllpairPkt->collective(), allreduceAllpairPkt->name(), allreduceAllpairPkt);
auto allreduceNvlsPacket = std::make_shared<AllreduceNvlsPacket>(scratchBuffer, scratchBufferSize)->build();
auto allreduceNvlsPacket =
std::make_shared<AllreduceNvlsPacket>(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build();
collection.registerAlgorithm(allreduceNvlsPacket->collective(), allreduceNvlsPacket->name(), allreduceNvlsPacket);
auto allreduceNvlsWithCopy = std::make_shared<AllreduceNvlsWithCopy>(scratchBuffer, scratchBufferSize)->build();
collection.registerAlgorithm(allreduceNvlsWithCopy->collective(), allreduceNvlsWithCopy->name(),
@@ -70,7 +75,8 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uin
auto allreduceNvlsWithCopy2 = std::make_shared<AllreduceNvlsWithCopy2>(scratchBuffer, scratchBufferSize)->build();
collection.registerAlgorithm(allreduceNvlsWithCopy2->collective(), allreduceNvlsWithCopy2->name(),
allreduceNvlsWithCopy2);
auto allreducePkt = std::make_shared<AllreducePacket>(scratchBuffer, scratchBufferSize)->build();
auto allreducePkt =
std::make_shared<AllreducePacket>(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build();
collection.registerAlgorithm(allreducePkt->collective(), allreducePkt->name(), allreducePkt);
auto allreduceNvls = std::make_shared<AllreduceNvls>()->build();
collection.registerAlgorithm(allreduceNvls->collective(), allreduceNvls->name(), allreduceNvls);

View File

@@ -170,7 +170,7 @@ std::shared_ptr<void> AllgatherFullmesh::initAllgatherContext(std::shared_ptr<Co
return ctx;
}
AlgorithmCtxKey AllgatherFullmesh::generateAllgatherContextKey(const void*, void*, size_t, DataType) {
AlgorithmCtxKey AllgatherFullmesh::generateAllgatherContextKey(const void*, void*, size_t, DataType, bool) {
// always return same key, non-zero copy algo
return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
@@ -189,8 +189,9 @@ std::shared_ptr<Algorithm> AllgatherFullmesh::build() {
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllgatherContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective

View File

@@ -107,12 +107,6 @@ __global__ void __launch_bounds__(1024, 1)
}
}
AllgatherFullmesh2::AllgatherFullmesh2() : disableChannelCache_(false) {
if (mscclpp::env()->disableChannelCache) {
disableChannelCache_ = true;
}
}
void AllgatherFullmesh2::initialize(std::shared_ptr<Communicator> comm) {
this->conns_ = setupConnections(comm);
this->memorySemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_);
@@ -174,7 +168,7 @@ std::shared_ptr<void> AllgatherFullmesh2::initAllgatherContext(std::shared_ptr<m
CUdeviceptr recvBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
size_t channelOutOffset = (char*)output - (char*)recvBasePtr;
if (disableChannelCache_) {
if (!symmetricMemory_) {
channelOutOffset = 0;
recvBytes = inputSize * comm->bootstrap()->getNranks();
recvBasePtr = (CUdeviceptr)output;
@@ -197,10 +191,11 @@ std::shared_ptr<void> AllgatherFullmesh2::initAllgatherContext(std::shared_ptr<m
}
mscclpp::AlgorithmCtxKey AllgatherFullmesh2::generateAllgatherContextKey(const void*, void* output, size_t,
mscclpp::DataType) {
mscclpp::DataType, bool symmetricMemory) {
static int tag = 0;
if (disableChannelCache_) {
// always return a new key if channel cache is disabled
symmetricMemory_ = symmetricMemory;
if (!symmetricMemory_) {
// always return a new key if symmetric memory is not enabled.
return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, tag++};
}
size_t recvBytes;
@@ -224,7 +219,9 @@ std::shared_ptr<Algorithm> AllgatherFullmesh2::build() {
[[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->generateAllgatherContextKey(input, output, inputSize, dtype); });
mscclpp::DataType dtype, bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective

View File

@@ -11,29 +11,18 @@
namespace mscclpp {
namespace collective {
__device__ uint32_t deviceFlag = 1;
template <ReduceOp OpType, typename T, bool flagPerBlock = false>
template <ReduceOp OpType, typename T>
__global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode,
int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags) {
int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags,
uint32_t flagSize) {
// This version of allreduce only works for single nodes
if (worldSize != nRanksPerNode) return;
if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
const int nPeers = nRanksPerNode - 1;
uint32_t flag = 0;
if constexpr (flagPerBlock) {
flag = ((uint32_t*)flags)[blockIdx.x];
} else {
flag = deviceFlag;
__syncthreads();
if (threadIdx.x == 0) {
((LL8Packet*)flags)[blockIdx.x].write(0, flag);
}
}
uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
size_t scratchBaseOffset = (flag % numScratchBuff) ? (scratchBufferSize / numScratchBuff) : 0;
size_t channelScratchOffset = scratchBaseOffset;
@@ -62,22 +51,12 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand
}
dst[idx] = data;
}
if constexpr (flagPerBlock) {
__syncthreads();
if (threadIdx.x == 0) {
((uint32_t*)flags)[blockIdx.x] = flag + 1;
}
} else {
// Make sure all threadblocks have finished reading before incrementing the flag
if (blockIdx.x == 0 && threadIdx.x < gridDim.x) {
((LL8Packet*)flags)[threadIdx.x].read(flag, -1);
}
if (blockIdx.x == 0) {
__syncthreads();
}
if (threadIdx.x == 0 && blockIdx.x == 0) {
deviceFlag++;
}
__syncthreads();
if (threadIdx.x == 0) {
((uint32_t*)flags)[blockIdx.x] = flag + 1;
}
if (blockIdx.x == 0 && threadIdx.x >= gridDim.x && threadIdx.x < flagSize / sizeof(uint32_t)) {
((uint32_t*)flags)[threadIdx.x] = flag + 1;
}
}
@@ -93,19 +72,13 @@ struct AllpairAdapter {
static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize,
cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0,
cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0,
int nThreadsPerBlock = 0) {
using ChannelType = DeviceHandle<MemoryChannel>;
const size_t nelems = inputSize / sizeof(T);
if (nBlocks == 7 || nBlocks == 28) {
allreduceAllPairs<OpType, T, true><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
(T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
nRanksPerNode, worldSize, nelems, numScratchBuff, flags);
return cudaGetLastError();
}
allreduceAllPairs<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
(T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
nRanksPerNode, worldSize, nelems, numScratchBuff, flags);
nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize);
return cudaGetLastError();
}
};
@@ -116,12 +89,6 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr<Communicator> comm) {
RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc);
registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory);
registeredMemories_.push_back(scratchMemory);
flags_ = detail::gpuCallocShared<LL8Packet>(maxBlockNum_);
std::vector<uint32_t> flags(28, 1);
flags7_ = detail::gpuCallocShared<uint32_t>(7);
flags28_ = detail::gpuCallocShared<uint32_t>(28);
gpuMemcpy<uint32_t>(flags7_.get(), flags.data(), 7, cudaMemcpyHostToDevice);
gpuMemcpy<uint32_t>(flags28_.get(), flags.data(), 28, cudaMemcpyHostToDevice);
}
CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
@@ -133,13 +100,6 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<voi
if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize);
}
void* flags = this->flags_.get();
if (blockAndThreadNum.first == 7) {
flags = this->flags7_.get();
} else if (blockAndThreadNum.first == 28) {
flags = this->flags28_.get();
}
size_t sendBytes;
CUdeviceptr sendBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
@@ -150,10 +110,11 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<voi
WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
return CommResult::CommInvalidArgument;
}
cudaError_t error = allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr,
nullptr, nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank,
algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, flags,
this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second);
cudaError_t error =
allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr,
nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerNode,
algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_,
this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second);
if (error != cudaSuccess) {
WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error));
return CommResult::CommUnhandledCudaError;
@@ -185,7 +146,7 @@ std::shared_ptr<void> AllreduceAllpairPacket::initAllreduceContext(std::shared_p
return ctx;
}
AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType) {
AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType, bool) {
size_t sendBytes;
CUdeviceptr sendBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
@@ -193,7 +154,8 @@ AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void*
}
std::shared_ptr<Algorithm> AllreduceAllpairPacket::build() {
auto self = std::make_shared<AllreduceAllpairPacket>(reinterpret_cast<uintptr_t>(scratchBuffer_), scratchBufferSize_);
auto self = std::make_shared<AllreduceAllpairPacket>(reinterpret_cast<uintptr_t>(scratchBuffer_), scratchBufferSize_,
flagBuffer_, flagBufferSize_);
return std::make_shared<NativeAlgorithm>(
"default_allreduce_allpair_packet", "allreduce",
[self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
@@ -206,8 +168,9 @@ std::shared_ptr<Algorithm> AllreduceAllpairPacket::build() {
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective

View File

@@ -149,7 +149,8 @@ struct AllreduceAllconnectAdapter {
static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels,
DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t,
size_t channelOutDataOffset, size_t, int rank, int nRanksPerNode, int worldSize,
size_t inputSize, cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) {
size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks,
int nThreadsPerBlock) {
using ChannelType = DeviceHandle<MemoryChannel>;
size_t nelems = inputSize / sizeof(T);
if (nBlocks == 0) nBlocks = 35;
@@ -180,8 +181,11 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr<void> ct
auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
size_t recvBytes;
CUdeviceptr recvBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
size_t channelOutOffset = (char*)output - (char*)recvBasePtr;
size_t channelOutOffset = 0;
if (symmetricMemory_) {
MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
channelOutOffset = (char*)output - (char*)recvBasePtr;
}
std::shared_ptr<DeviceHandle<MemoryChannel>> inputChannelHandles;
if (this->memoryChannelsMap_.find(input) != this->memoryChannelsMap_.end()) {
inputChannelHandles = this->memoryChannelsMap_[input].second;
@@ -204,7 +208,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr<void> ct
cudaError_t error =
allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(),
nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize,
stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
if (error != cudaSuccess) {
WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error));
return CommResult::CommUnhandledCudaError;
@@ -212,19 +216,21 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr<void> ct
return CommResult::CommSuccess;
}
AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType) {
AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType,
bool symmetricMemory) {
static int tag = 0;
size_t recvBytes;
CUdeviceptr recvBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
if (env()->disableChannelCache) {
symmetricMemory_ = symmetricMemory;
if (!symmetricMemory_) {
return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, tag++};
}
return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, 0};
}
std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Communicator> comm, const void*,
void* output, size_t, DataType) {
void* output, size_t size, DataType) {
auto ctx = std::make_shared<AlgorithmCtx>();
ctx->rank = comm->bootstrap()->getRank();
ctx->workSize = comm->bootstrap()->getNranks();
@@ -236,6 +242,10 @@ std::shared_ptr<void> AllreduceFullmesh::initAllreduceContext(std::shared_ptr<Co
size_t recvBytes;
CUdeviceptr recvBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
if (!symmetricMemory_) {
recvBytes = size;
recvBasePtr = (CUdeviceptr)output;
}
RegisteredMemory localMemory = comm->registerMemory((void*)recvBasePtr, recvBytes, Transport::CudaIpc);
ctx->registeredMemories = setupRemoteMemories(comm, ctx->rank, localMemory);
ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory,
@@ -258,8 +268,9 @@ std::shared_ptr<Algorithm> AllreduceFullmesh::build() {
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective

View File

@@ -23,9 +23,18 @@ __global__ void __launch_bounds__(1024, 1)
int nBlocks = gridDim.x;
int bid = blockIdx.x;
size_t sizePerRank = size / nRanksPerNode;
size_t sizePerBlock = sizePerRank / nBlocks;
const size_t minAlign = 16;
// Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore
size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks;
sizePerBlock = (sizePerBlock + minAlign - 1) / minAlign * minAlign;
size_t rankOffset = sizePerRank * rank;
size_t blockOffset = sizePerBlock * bid + rankOffset;
size_t curBlockSize = 0;
if (sizePerBlock * bid < sizePerRank) {
curBlockSize = min(sizePerBlock, sizePerRank - sizePerBlock * bid);
}
mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastPtr = multicast + bid;
mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastOutPtr = multicastOut + bid;
@@ -44,8 +53,10 @@ __global__ void __launch_bounds__(1024, 1)
__syncthreads();
T* src = (T*)multicastPtr->mcPtr;
T* dst = (T*)multicastOutPtr->mcPtr;
handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock,
threadIdx.x, blockDim.x);
if (curBlockSize > 0) {
handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, curBlockSize,
threadIdx.x, blockDim.x);
}
__syncthreads();
if (threadIdx.x < nPeers) {
channels[threadIdx.x].relaxedSignal();
@@ -60,7 +71,7 @@ struct NvlsAdapter {
mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsOutChannels, size_t channelInOffset,
size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t inputSize,
cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) {
cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
#if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)
if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
return cudaErrorNotSupported;
@@ -77,7 +88,12 @@ struct NvlsAdapter {
};
void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm) {
nSwitchChannels_ = 8;
int device;
MSCCLPP_CUDATHROW(cudaGetDevice(&device));
cudaDeviceProp deviceProp;
MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device));
computeCapabilityMajor_ = deviceProp.major;
nSwitchChannels_ = 32;
this->conns_ = setupConnections(comm);
// setup semaphores
std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =
@@ -91,6 +107,10 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
size_t inputSize, mscclpp::DataType dtype, ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>&) {
if (!symmetricMemory_) {
WARN("AllreduceNvls requires symmetric memory for now.");
return CommResult::CommInvalidArgument;
}
auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype);
if (!allreduce) {
@@ -110,12 +130,16 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
}
std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) {
numBlocksAndThreads = {ctx->nRanksPerNode, 1024};
numBlocksAndThreads = {::min(ctx->nRanksPerNode, nSwitchChannels_), 1024};
// For GB200 devices, use more blocks to improve performance when nRanksPerNode <= 8
if (computeCapabilityMajor_ == 10 && ctx->nRanksPerNode <= 8) {
numBlocksAndThreads.first = ::min(32, nSwitchChannels_);
}
}
cudaError_t error =
allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels,
nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize,
inputSize, stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second);
if (error != cudaSuccess) {
WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error));
return CommResult::CommUnhandledCudaError;
@@ -124,7 +148,8 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
}
mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t,
mscclpp::DataType) {
mscclpp::DataType, bool symmetricMemory) {
symmetricMemory_ = symmetricMemory;
size_t sendBytes, recvBytes;
CUdeviceptr sendBasePtr, recvBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
@@ -174,7 +199,9 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvls::build() {
[[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); });
mscclpp::DataType dtype, bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective
} // namespace mscclpp

View File

@@ -9,25 +9,15 @@
namespace mscclpp {
namespace collective {
__device__ uint32_t deviceFlag = 1;
template <ReduceOp OpType, typename T, bool flagPerBlock = false>
template <ReduceOp OpType, typename T>
__global__ void __launch_bounds__(1024, 1)
allreduceNvlsPacket([[maybe_unused]] const T* input, [[maybe_unused]] T* scratch, [[maybe_unused]] T* output,
[[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicast,
[[maybe_unused]] size_t nelems, [[maybe_unused]] size_t scratchBufferSize,
[[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags) {
[[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags,
[[maybe_unused]] uint32_t flagBufferSize) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
uint32_t flag = 0;
if constexpr (flagPerBlock) {
flag = ((uint32_t*)flags)[blockIdx.x];
} else {
flag = deviceFlag;
__syncthreads();
if (threadIdx.x == 0) {
((LL8Packet*)flags)[blockIdx.x].write(0, flag);
}
}
uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
size_t scratchBaseOffset = (flag % 2) ? scratchBufferSize / 2 : 0;
uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
uint32_t nPktPerRank = nelems / worldSize / (sizeof(mscclpp::LL8Packet::Payload) / sizeof(T));
@@ -51,21 +41,13 @@ __global__ void __launch_bounds__(1024, 1)
}
dst[i] = data;
}
if constexpr (flagPerBlock) {
__syncthreads();
if (threadIdx.x == 0) {
((uint32_t*)flags)[blockIdx.x] = flag + 1;
}
} else {
if (blockIdx.x == 0 && threadIdx.x < gridDim.x) {
((LL8Packet*)flags)[threadIdx.x].read(flag, -1);
}
if (blockIdx.x == 0) {
__syncthreads();
}
if (threadIdx.x == 0 && blockIdx.x == 0) {
deviceFlag++;
}
__syncthreads();
if (threadIdx.x == 0) {
((uint32_t*)flags)[blockIdx.x] = flag + 1;
}
// update the remaining flags in case a different number of blocks is used in the next launch
if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) {
((uint32_t*)flags)[threadIdx.x] = flag + 1;
}
#endif
}
@@ -85,30 +67,17 @@ struct AllreduceNvlsPacketAdapter {
static cudaError_t call(const void* input, void* scratch, void* output, void*, void*,
DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
size_t scratchBufferSize, int rank, int, int worldSize, size_t inputSize, cudaStream_t stream,
void* flags, uint32_t, int nBlocks, int nThreadsPerBlock) {
if (nBlocks == 4 || nBlocks == 8) {
allreduceNvlsPacket<OpType, T, true>
<<<nBlocks, nThreadsPerBlock, 0, stream>>>((const T*)input, (T*)scratch, (T*)output, nvlsChannels,
inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags);
} else {
allreduceNvlsPacket<OpType, T>
<<<nBlocks, nThreadsPerBlock, 0, stream>>>((const T*)input, (T*)scratch, (T*)output, nvlsChannels,
inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags);
}
void* flags, uint32_t flagBufferSize, uint32_t, int nBlocks, int nThreadsPerBlock) {
allreduceNvlsPacket<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
(const T*)input, (T*)scratch, (T*)output, nvlsChannels, inputSize / sizeof(T), scratchBufferSize, rank,
worldSize, flags, flagBufferSize);
return cudaGetLastError();
}
};
void AllreduceNvlsPacket::initialize(std::shared_ptr<Communicator>) {
std::vector<uint32_t> flags(8, 1);
flags_ = detail::gpuCallocShared<LL8Packet>(16);
flags4_ = detail::gpuCallocShared<uint32_t>(4);
flags8_ = detail::gpuCallocShared<uint32_t>(8);
gpuMemcpy<uint32_t>(flags4_.get(), flags.data(), 4, cudaMemcpyHostToDevice);
gpuMemcpy<uint32_t>(flags8_.get(), flags.data(), 8, cudaMemcpyHostToDevice);
}
void AllreduceNvlsPacket::initialize(std::shared_ptr<Communicator>) {}
AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType) {
AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
@@ -146,16 +115,10 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<void>
WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
return CommResult::CommInvalidArgument;
}
void* flags = this->flags_.get();
if (blockAndThreadNum.first == 4) {
flags = this->flags4_.get();
} else if (blockAndThreadNum.first == 8) {
flags = this->flags8_.get();
}
cudaError_t error =
allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr,
0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, flags,
0, blockAndThreadNum.first, blockAndThreadNum.second);
0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream,
(void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second);
if (error != cudaSuccess) {
WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error));
return CommResult::CommUnhandledCudaError;
@@ -164,7 +127,8 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<void>
}
std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsPacket::build() {
auto self = std::make_shared<AllreduceNvlsPacket>((uintptr_t)scratchBuffer_, scratchBufferSize_);
auto self = std::make_shared<AllreduceNvlsPacket>((uintptr_t)scratchBuffer_, scratchBufferSize_, flagBuffer_,
flagBufferSize_);
return std::make_shared<mscclpp::NativeAlgorithm>(
"default_allreduce_nvls_packet", "allreduce",
[self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
@@ -178,7 +142,9 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsPacket::build() {
[[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize,
mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); });
mscclpp::DataType dtype, bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective
} // namespace mscclpp

View File

@@ -113,7 +113,7 @@ struct NvlsWithCopyAdapter {
static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize,
cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) {
cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
return cudaErrorNotSupported;
@@ -157,7 +157,7 @@ CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptr<void
}
cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0,
ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0,
blockAndThreadNum.first, blockAndThreadNum.second);
if (error != cudaSuccess) {
WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error));
@@ -166,7 +166,7 @@ CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptr<void
return CommResult::CommSuccess;
}
AlgorithmCtxKey AllreduceNvlsWithCopy::generateAllreduceContextKey(const void*, void*, size_t, DataType) {
AlgorithmCtxKey AllreduceNvlsWithCopy::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
@@ -199,8 +199,9 @@ std::shared_ptr<Algorithm> AllreduceNvlsWithCopy::build() {
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}
} // namespace collective

View File

@@ -150,7 +150,7 @@ struct NvlsWithCopy2Adapter {
static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize,
cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) {
cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) {
#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
return cudaErrorNotSupported;
@@ -194,7 +194,7 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr<voi
}
cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr,
ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_,
ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0,
ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0,
blockAndThreadNum.first, blockAndThreadNum.second);
if (error != cudaSuccess) {
WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error));
@@ -203,7 +203,7 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr<voi
return CommResult::CommSuccess;
}
AlgorithmCtxKey AllreduceNvlsWithCopy2::generateAllreduceContextKey(const void*, void*, size_t, DataType) {
AlgorithmCtxKey AllreduceNvlsWithCopy2::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
@@ -236,8 +236,9 @@ std::shared_ptr<Algorithm> AllreduceNvlsWithCopy2::build() {
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}

View File

@@ -11,13 +11,11 @@
namespace mscclpp {
namespace collective {
__device__ uint32_t deviceFlag = 1;
template <ReduceOp OpType, typename T>
__global__ void __launch_bounds__(1024, 1)
allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize,
size_t nelems, void* flags, uint32_t numScratchBuff
size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff
#if defined(ENABLE_NPKIT)
,
NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) {
@@ -60,11 +58,7 @@ __global__ void __launch_bounds__(1024, 1)
const int nPeers = nRanksPerNode - 1;
const size_t nPkts = nelems / 2;
uint32_t flag = deviceFlag;
__syncthreads();
if (threadIdx.x == 0) {
((LL8Packet*)flags)[blockIdx.x].write(0, flag);
}
uint32_t flag = ((uint32_t*)flags)[blockIdx.x];
size_t channelScratchOffset = (flag % numScratchBuff) ? scratchBufferSize / numScratchBuff : 0;
int nelemsPerRank = nelems / worldSize;
@@ -129,15 +123,12 @@ __global__ void __launch_bounds__(1024, 1)
result[idx].y = data.y;
}
// Make sure all threadblocks have finished reading before incrementing the flag
if (blockIdx.x == 0 && threadIdx.x < gridDim.x) {
((LL8Packet*)flags)[threadIdx.x].read(flag, -1);
__syncthreads();
if (threadIdx.x == 0) {
((uint32_t*)flags)[blockIdx.x] = flag + 1;
}
if (blockIdx.x == 0) {
__syncthreads();
}
if (threadIdx.x == 0 && blockIdx.x == 0) {
deviceFlag++;
if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) {
((uint32_t*)flags)[threadIdx.x] = flag + 1;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && \
defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT)
@@ -156,20 +147,22 @@ struct PacketAdapter {
static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize,
cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0,
int nThreadsPerBlock = 0) {
cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff,
int nBlocks = 0, int nThreadsPerBlock = 0) {
using ChannelType = DeviceHandle<MemoryChannel>;
const size_t nelems = inputSize / sizeof(T);
// Optimize the number of blocks to be multiple of (worldSize - 1)
nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1);
#if defined(ENABLE_NPKIT)
size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS;
allreducePacket<OpType><<<nBlocks, nThreadsPerBlock, sharedMemSize, stream>>>(
(T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
nRanksPerNode, worldSize, nelems, flags, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
NpKit::GetCpuTimestamp());
#else
allreducePacket<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
(T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
nRanksPerNode, worldSize, nelems, flags, numScratchBuff);
nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff);
#endif
return cudaGetLastError();
}
@@ -215,7 +208,6 @@ void AllreducePacket::initialize(std::shared_ptr<Communicator> comm) {
RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc);
registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory);
registeredMemories_.push_back(scratchMemory);
flags_ = detail::gpuCallocShared<LL8Packet>(maxBlockNum_);
}
CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input, void* output,
@@ -233,7 +225,6 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
size_t channelInOffset = (char*)input - (char*)sendBasePtr;
void* flags = this->flags_.get();
AllreduceFunc allreduce = dispatch<PacketAdapter>(op, dtype);
if (!allreduce) {
WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
@@ -242,7 +233,8 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
cudaError_t error =
allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr,
channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize,
stream, flags, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second);
stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_,
blockAndThreadNum.first, blockAndThreadNum.second);
if (error != cudaSuccess) {
WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error));
return CommResult::CommUnhandledCudaError;
@@ -274,7 +266,7 @@ std::shared_ptr<void> AllreducePacket::initAllreduceContext(std::shared_ptr<Comm
return ctx;
}
AlgorithmCtxKey AllreducePacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType) {
AlgorithmCtxKey AllreducePacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType, bool) {
size_t sendBytes;
CUdeviceptr sendBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
@@ -282,7 +274,8 @@ AlgorithmCtxKey AllreducePacket::generateAllreduceContextKey(const void* input,
}
std::shared_ptr<Algorithm> AllreducePacket::build() {
auto self = std::make_shared<AllreducePacket>(reinterpret_cast<uintptr_t>(scratchBuffer_), scratchBufferSize_);
auto self = std::make_shared<AllreducePacket>(reinterpret_cast<uintptr_t>(scratchBuffer_), scratchBufferSize_,
flagBuffer_, flagBufferSize_);
return std::make_shared<NativeAlgorithm>(
"default_allreduce_packet", "allreduce", [self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
@@ -294,8 +287,9 @@ std::shared_ptr<Algorithm> AllreducePacket::build() {
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype);
[self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype,
bool symmetricMemory) {
return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory);
});
}

View File

@@ -25,7 +25,7 @@ class AllgatherFullmesh : public AlgorithmBuilder {
std::shared_ptr<void> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm, const void*, void* output,
size_t, mscclpp::DataType);
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType);
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType, bool);
void* scratchBuffer_;
size_t scratchBufferSize_;

View File

@@ -11,11 +11,11 @@ namespace collective {
class AllgatherFullmesh2 : public AlgorithmBuilder {
public:
AllgatherFullmesh2();
AllgatherFullmesh2() = default;
std::shared_ptr<Algorithm> build() override;
private:
bool disableChannelCache_;
bool symmetricMemory_;
std::vector<Connection> conns_;
std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores_;
const int nChannelsPerConnection_ = 35;
@@ -27,7 +27,7 @@ class AllgatherFullmesh2 : public AlgorithmBuilder {
std::shared_ptr<void> initAllgatherContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType, bool);
};
} // namespace collective

View File

@@ -9,8 +9,11 @@ namespace mscclpp {
namespace collective {
class AllreduceAllpairPacket : public AlgorithmBuilder {
public:
AllreduceAllpairPacket(uintptr_t scratchBuffer, size_t scratchBufferSize)
: scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){};
AllreduceAllpairPacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize)
: scratchBuffer_((void*)scratchBuffer),
scratchBufferSize_(scratchBufferSize),
flagBuffer_(flagBuffer),
flagBufferSize_(flagBufferSize){};
std::shared_ptr<Algorithm> build() override;
private:
@@ -21,7 +24,7 @@ class AllreduceAllpairPacket : public AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
void* scratchBuffer_;
size_t scratchBufferSize_;
@@ -30,9 +33,8 @@ class AllreduceAllpairPacket : public AlgorithmBuilder {
std::vector<Connection> conns_;
std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores_;
std::vector<RegisteredMemory> registeredMemories_;
std::shared_ptr<LL8Packet> flags_;
std::shared_ptr<uint32_t> flags7_;
std::shared_ptr<uint32_t> flags28_;
uintptr_t flagBuffer_;
size_t flagBufferSize_;
};
} // namespace collective
} // namespace mscclpp

View File

@@ -20,7 +20,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
void* scratchBuffer_;
size_t scratchBufferSize_;
std::shared_ptr<Communicator> comm_;
@@ -32,6 +32,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder {
RegisteredMemory localScratchMemory_;
std::unordered_map<const void*, std::pair<std::vector<MemoryChannel>, std::shared_ptr<DeviceHandle<MemoryChannel>>>>
memoryChannelsMap_;
bool symmetricMemory_ = false;
};
} // namespace collective
} // namespace mscclpp

View File

@@ -12,6 +12,7 @@ class AllreduceNvls : public AlgorithmBuilder {
std::shared_ptr<Algorithm> build() override;
private:
bool symmetricMemory_ = false;
void initialize(std::shared_ptr<Communicator> comm);
CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
@@ -19,13 +20,14 @@ class AllreduceNvls : public AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
const size_t nvlsBufferSize_ = (1 << 30);
uint32_t nSwitchChannels_;
std::shared_ptr<DeviceHandle<BaseMemoryChannel>> memoryChannelsDeviceHandle_;
std::vector<BaseMemoryChannel> baseChannels_;
std::vector<Connection> conns_;
int computeCapabilityMajor_{0};
};
} // namespace collective

View File

@@ -10,8 +10,11 @@ namespace mscclpp {
namespace collective {
class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder {
public:
AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize)
: scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){};
AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize)
: scratchBuffer_((void*)scratchBuffer),
scratchBufferSize_(scratchBufferSize),
flagBuffer_(flagBuffer),
flagBufferSize_(flagBufferSize){};
std::shared_ptr<mscclpp::Algorithm> build() override;
private:
@@ -22,15 +25,14 @@ class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<mscclpp::Communicator> comm, const void*, void* output,
size_t, mscclpp::DataType);
mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType);
mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType, bool);
void* scratchBuffer_;
size_t scratchBufferSize_;
const size_t nvlsBufferSize_ = (1 << 30);
const int maxBlockNum_ = 16;
std::shared_ptr<LL8Packet> flags_;
std::shared_ptr<uint32_t> flags4_;
std::shared_ptr<uint32_t> flags8_;
uintptr_t flagBuffer_;
size_t flagBufferSize_;
};
} // namespace collective
} // namespace mscclpp

View File

@@ -20,7 +20,7 @@ class AllreduceNvlsWithCopy : public AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
const size_t nvlsBufferSize_ = (1 << 30);
void* scratchBuffer_;

View File

@@ -23,7 +23,7 @@ class AllreduceNvlsWithCopy2 : public AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
const size_t nvlsBufferSize_ = (1 << 30);
void* scratchBuffer_;

View File

@@ -9,8 +9,11 @@ namespace mscclpp {
namespace collective {
class AllreducePacket : public AlgorithmBuilder {
public:
AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize)
: scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){};
AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize)
: scratchBuffer_((void*)scratchBuffer),
scratchBufferSize_(scratchBufferSize),
flagBuffer_(flagBuffer),
flagBufferSize_(flagBufferSize){};
std::shared_ptr<Algorithm> build() override;
private:
@@ -21,16 +24,17 @@ class AllreducePacket : public AlgorithmBuilder {
std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType);
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool);
void* scratchBuffer_;
size_t scratchBufferSize_;
const int nSegmentsForScratchBuffer_ = 2;
const int maxBlockNum_ = 56;
std::vector<Connection> conns_;
uintptr_t flagBuffer_;
size_t flagBufferSize_;
std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores_;
std::vector<RegisteredMemory> registeredMemories_;
std::shared_ptr<LL8Packet> flags_;
};
} // namespace collective
} // namespace mscclpp

View File

@@ -75,7 +75,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src
using AllreduceFunc =
std::function<cudaError_t(const void*, void*, void*, void*, void*, mscclpp::DeviceHandle<mscclpp::SwitchChannel>*,
mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t, int, int, int,
size_t, cudaStream_t, void*, uint32_t, int, int)>;
size_t, cudaStream_t, void*, uint32_t, uint32_t, int, int)>;
template <template <ReduceOp, typename> class Adapter>
AllreduceFunc dispatch(ReduceOp op, mscclpp::DataType dtype) {

View File

@@ -31,7 +31,6 @@ constexpr int NUM_SEMAPHORES = 64;
constexpr int MAX_NRANKS_PER_NODE = 8;
constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
static bool mscclppDisableChannelCache = env()->disableChannelCache;
std::vector<RegisteredMemory> setupRemoteMemories(std::shared_ptr<Communicator> comm, int rank,
RegisteredMemory localMemory);

View File

@@ -0,0 +1,172 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "algorithm_selector.hpp"
#include <mscclpp/env.hpp>
#include <mscclpp/utils.hpp>
#include "debug.h"
namespace mscclpp {
namespace nccl {
// Decide whether NVLS can be used for the given data type on this device.
// For non-FP8 types this simply forwards the device-level capability flag;
// FP8 additionally requires compute capability >= 10 and an arch-specific
// CUDA build on NVIDIA platforms.
static bool isNvlsSupportedForDataType(const AlgorithmSelectorConfig& config, DataType dtype) {
  bool nvlsSupported = config.nvlsSupported;
  const bool isFp8 = dtype == DataType::FP8_E4M3 || dtype == DataType::FP8_E5M2;
  if (!isFp8) {
    return nvlsSupported;
  }
  // FP8 handling
#if !defined(__HIP_PLATFORM_AMD__)
  // NVLS does not support FP8 on devices with compute capability < 10
  if (config.computeCapability.first < 10) {
    return false;
  }
  // NOTE(review): this tests __CUDA_ARCH_SPECIFIC__/__CUDA_ARCH_FAMILY_SPECIFIC__
  // at preprocessing time of this host TU — presumably the build defines these for
  // arch-specific (sm_10x) compilations; confirm against the build configuration.
#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__))
  return true;
#else
  return false;
#endif
#else
  // AMD path: no FP8-specific restriction beyond the device capability flag.
  return nvlsSupported;
#endif
}
/// Check whether a DSL execution plan is applicable to the given collective request.
/// A plan matches when its world size, ranks-per-node, collective name, buffer mode,
/// and message-size range all accept the request.
bool matchExecutionPlan(std::shared_ptr<DslAlgorithm> algo, const CollectiveRequest& request) {
  if (algo->constraint().worldSize != request.worldSize) return false;
  if (algo->constraint().nRanksPerNode != request.nRanksPerNode) return false;
  if (algo->collective() != request.collective) return false;
  if (algo->bufferMode() != CollectiveBufferMode::Any && request.bufferMode() != algo->bufferMode()) return false;
  // Allgather plans are ranged on the aggregated size (per-rank bytes * world size),
  // everything else on the raw message size.
  const size_t effectiveSize =
      (request.collective == "allgather") ? (request.messageSize * request.worldSize) : request.messageSize;
  return effectiveSize >= algo->messageRange().first && effectiveSize <= algo->messageRange().second;
}
// Blackwell (compute capability 10.x) specific allreduce selection.
// Decision ladder: tiny messages -> NVLS packet; without symmetric memory only the
// packet algorithm is viable (and only up to 2MB); with symmetric memory prefer the
// packet algorithm for small/medium sizes and zero-copy NVLS for the rest.
static std::shared_ptr<Algorithm> selectSingleNodeAllreduceBlackwell(
    const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
    const AlgorithmSelectorConfig& config) {
  const size_t msgBytes = request.messageSize;
  const bool nvlsOk = isNvlsSupportedForDataType(config, request.dtype);

  // Small messages always use NVLS packet algorithm.
  if (msgBytes <= (1 << 15)) {  // <= 32KB
    return algoMap.at("default_allreduce_nvls_packet");
  }

  if (!config.symmetricMemory) {
    // Without symmetric memory, only mid-sized messages are handled here.
    if (msgBytes <= (1 << 21)) {  // <= 2MB
      return algoMap.at("default_allreduce_packet");
    }
    return nullptr;
  }

  // Symmetric memory path: can use cached memory handles.
  const bool zeroCopyNvls = nvlsOk && config.isCuMemMapAllocated;
  if (msgBytes <= (1 << 16) || (!zeroCopyNvls && msgBytes <= (1 << 20))) {  // <= 64KB or <= 1MB
    return algoMap.at("default_allreduce_packet");
  }
  if (zeroCopyNvls) {
    return algoMap.at("default_allreduce_nvls");
  }
  INFO(MSCCLPP_NCCL, "No suitable kernel for Blackwell architecture, fallback to nccl/rccl");
  return nullptr;
}
/// Select an algorithm for single-node allreduce.
/// Dispatches to the Blackwell-specific ladder on compute capability 10.x;
/// otherwise picks by message size, NVLS availability for the data type, and
/// whether zero-copy NVLS (symmetric memory + cuMemMap-allocated buffers) applies.
/// Returns nullptr when no in-library algorithm fits, signaling fallback to NCCL/RCCL.
std::shared_ptr<Algorithm> selectSingleNodeAllreduce(
    const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
    const AlgorithmSelectorConfig& config) {
  // Use Blackwell-specific selection for compute capability 10.x
  if (config.computeCapability.first == 10) {
    return selectSingleNodeAllreduceBlackwell(algoMap, request, config);
  }

  const size_t messageSize = request.messageSize;

  // Determine NVLS availability based on data type and device capability
  const bool nvlsSupported = isNvlsSupportedForDataType(config, request.dtype);
  const bool useNvlsWithZeroCopy = nvlsSupported && config.symmetricMemory && config.isCuMemMapAllocated;

  // Very small messages: use allpair packet algorithm
  if (messageSize <= (1 << 14)) {  // <= 16KB
    return algoMap.at("default_allreduce_allpair_packet");
  }
  // Small messages with NVLS support
  if (messageSize <= (1 << 15) && nvlsSupported) {  // <= 32KB
    return algoMap.at("default_allreduce_nvls_packet");
  }
  // Medium messages: use packet algorithm
  if (messageSize <= (1 << 16) || (messageSize <= (1 << 20) && !useNvlsWithZeroCopy)) {  // <= 64KB or <= 1MB
    return algoMap.at("default_allreduce_packet");
  }
  // Large messages with NVLS zero-copy support.
  // (useNvlsWithZeroCopy already implies nvlsSupported, so no extra check is needed.)
  if (useNvlsWithZeroCopy) {
    return algoMap.at("default_allreduce_nvls");
  }
  // Large messages with NVLS but without zero-copy
  if (nvlsSupported) {
    if (messageSize < (1 << 24)) {  // < 16MB
      return algoMap.at("default_allreduce_nvls_with_copy");
    }
    return algoMap.at("default_allreduce_nvls_with_copy2");
  }
#if defined(__HIP_PLATFORM_AMD__)
  // AMD platform: use fullmesh algorithm
  return algoMap.at("default_allreduce_fullmesh");
#else
  // NVIDIA without NVLS: use fullmesh when no NCCL fallback library is loaded
  if (!config.ncclDlopenSharedLib) {
    return algoMap.at("default_allreduce_fullmesh");
  }
  return nullptr;
#endif
}
/// Select an algorithm for single-node allgather.
/// fullmesh2 handles everything up to 32MB (and all sizes on AMD); on NVIDIA,
/// larger messages use fullmesh only when no NCCL fallback library is loaded.
std::shared_ptr<Algorithm> selectSingleNodeAllgather(
    const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
    [[maybe_unused]] const AlgorithmSelectorConfig& config) {
  constexpr size_t fullmesh2MaxBytes = 32 * (1 << 20);  // 32MB
  if (request.messageSize <= fullmesh2MaxBytes) {
    return algoMap.at("default_allgather_fullmesh2");
  }
#if defined(__HIP_PLATFORM_AMD__)
  // AMD platform always uses fullmesh2, regardless of message size.
  return algoMap.at("default_allgather_fullmesh2");
#else
  // NVIDIA: fall back to NCCL when its shared library is available.
  if (config.ncclDlopenSharedLib) {
    return nullptr;
  }
  return algoMap.at("default_allgather_fullmesh");
#endif
}
/// Placeholder for multi-node algorithm selection.
/// Currently always returns nullptr so the caller falls back to NCCL/RCCL.
std::shared_ptr<Algorithm> selectMultiNodeAlgorithm(
    const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap [[maybe_unused]],
    const CollectiveRequest& request [[maybe_unused]], const AlgorithmSelectorConfig& config [[maybe_unused]]) {
  // TODO: Implement multi-node algorithm selection
  // Multi-node scenarios will need to consider:
  // 1. Multi-node NVLS (if supported by hardware)
  // 2. Multi-node IB (InfiniBand)
  // 3. Hierarchical algorithms (intra-node + inter-node)
  // 4. Network topology awareness
  // For now, return nullptr to fallback to NCCL/RCCL
  INFO(MSCCLPP_NCCL, "Multi-node collective not yet supported, fallback to nccl/rccl");
  return nullptr;
}
} // namespace nccl
} // namespace mscclpp

View File

@@ -0,0 +1,48 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#ifndef MSCCLPP_EXT_NCCL_ALGORITHM_SELECTOR_HPP_
#define MSCCLPP_EXT_NCCL_ALGORITHM_SELECTOR_HPP_
#include <memory>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/core.hpp>
#include <unordered_map>
namespace mscclpp {
namespace nccl {
/// Configuration for algorithm selection.
/// Snapshot of per-call environment facts the selector functions consult.
/// Keep the field order stable: callers populate this with designated
/// initializers, which require declaration order.
struct AlgorithmSelectorConfig {
  bool symmetricMemory;      // from the MSCCLPP ncclSymmetricMemory env setting
  bool nvlsSupported;        // device-level NVLS support (mscclpp::isNvlsSupported)
  bool isCuMemMapAllocated;  // both input and output buffers were cuMemMap-allocated
  bool inCaptureMode;        // the request's stream is actively capturing a CUDA graph
  std::pair<int, int> computeCapability;  // {major, minor} of the current device
  bool ncclDlopenSharedLib;  // a vendor NCCL/RCCL shared library is loaded as fallback
};
/// Select an algorithm for single-node allreduce
std::shared_ptr<Algorithm> selectSingleNodeAllreduce(
const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
const AlgorithmSelectorConfig& config);
/// Select an algorithm for single-node allgather
std::shared_ptr<Algorithm> selectSingleNodeAllgather(
const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
const AlgorithmSelectorConfig& config);
/// Select an algorithm for multi-node collective operations
/// Currently returns nullptr to fallback to NCCL/RCCL
/// TODO: Implement multi-node NVLS and multi-node IB algorithms
std::shared_ptr<Algorithm> selectMultiNodeAlgorithm(
const std::unordered_map<std::string, std::shared_ptr<Algorithm>>& algoMap, const CollectiveRequest& request,
const AlgorithmSelectorConfig& config);
/// Check if an execution plan matches the request
bool matchExecutionPlan(std::shared_ptr<DslAlgorithm> algo, const CollectiveRequest& request);
} // namespace nccl
} // namespace mscclpp
#endif // MSCCLPP_EXT_NCCL_ALGORITHM_SELECTOR_HPP_

View File

@@ -9,6 +9,8 @@
#include <cstddef>
#include <mscclpp/gpu_data_types.hpp>
#include "logger.hpp"
// Convert ncclDataType_t to mscclpp::DataType
inline mscclpp::DataType ncclDataTypeToMscclpp(ncclDataType_t dtype) {
switch (dtype) {
@@ -70,8 +72,8 @@ static inline ncclDataType_t mscclppToNcclDataType(mscclpp::DataType dtype) {
return ncclFloat8e5m2;
#endif
default:
assert(false && "Unsupported mscclpp::DataType");
return ncclNumTypes;
THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
"Unsupported mscclpp::DataType: " + std::to_string(static_cast<int>(dtype)));
}
}

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <algorithm>
#include <filesystem>
@@ -19,8 +19,10 @@
#include <mscclpp/algorithm.hpp>
#include "algorithm_selector.hpp"
#include "datatype_conversion.hpp"
#include "debug.h"
static constexpr auto MSCCLPP_NCCL = mscclpp::LogSubsys::NCCL;
#define NCCL_API extern "C" __attribute__((visibility("default")))
@@ -80,17 +82,17 @@ static inline int mscclppNcclDlopenInit() {
const char* ncclLibPath = mscclpp::env()->ncclSharedLibPath.c_str();
if (ncclLibPath != nullptr && ncclLibPath[0] != '\0') {
if (std::filesystem::is_directory(ncclLibPath)) {
WARN("The value of the environment variable %s is a directory", ncclLibPath);
WARN(MSCCLPP_NCCL, "The value of the environment variable %s is a directory", ncclLibPath);
return dlopenError;
}
mscclppNcclDlHandle = dlopen(ncclLibPath, RTLD_LAZY | RTLD_NODELETE);
if (!mscclppNcclDlHandle) {
WARN("Cannot open the shared library specified by MSCCLPP_NCCL_LIB_PATH: %s\n", dlerror());
WARN(MSCCLPP_NCCL, "Cannot open the shared library specified by MSCCLPP_NCCL_LIB_PATH: %s\n", dlerror());
return dlopenError;
}
} else {
WARN("The value of MSCCLPP_NCCL_LIB_PATH is empty!\n");
WARN(MSCCLPP_NCCL, "The value of MSCCLPP_NCCL_LIB_PATH is empty!\n");
return dlopenError;
}
@@ -179,7 +181,7 @@ struct ncclComm {
NCCL_API ncclResult_t ncclGetVersion(int* version) {
if (version == nullptr) {
WARN("version is nullptr");
WARN(MSCCLPP_NCCL, "version is nullptr");
return ncclInvalidArgument;
}
*version = MSCCLPP_VERSION;
@@ -188,7 +190,7 @@ NCCL_API ncclResult_t ncclGetVersion(int* version) {
NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) {
if (uniqueId == nullptr) {
WARN("uniqueId is nullptr");
WARN(MSCCLPP_NCCL, "uniqueId is nullptr");
return ncclInvalidArgument;
}
if (mscclpp::UniqueIdBytes != NCCL_UNIQUE_ID_BYTES) return ncclInternalError;
@@ -212,21 +214,6 @@ static std::pair<int, int> getDeviceComputeCapability() {
return std::make_pair(major, minor);
}
static bool matchExecutionPlan(std::shared_ptr<mscclpp::DslAlgorithm> algo, const mscclpp::CollectiveRequest& request) {
bool worldSizeMatch = algo->constraint().worldSize == request.worldSize;
bool ranksPerNodeMatch = algo->constraint().nRanksPerNode == request.nRanksPerNode;
bool collectiveMatch = algo->collective() == request.collective;
bool bufferModeMatch =
algo->bufferMode() == mscclpp::CollectiveBufferMode::Any || request.bufferMode() == algo->bufferMode();
size_t effectiveSize =
(request.collective == "allgather") ? (request.messageSize * request.worldSize) : request.messageSize;
bool minSizeMatch = effectiveSize >= algo->messageRange().first;
bool maxSizeMatch = effectiveSize <= algo->messageRange().second;
bool result =
worldSizeMatch && ranksPerNodeMatch && collectiveMatch && bufferModeMatch && minSizeMatch && maxSizeMatch;
return result;
}
static std::shared_ptr<mscclpp::Algorithm> algoSelector(
const std::unordered_map<std::string, std::unordered_map<std::string, std::shared_ptr<mscclpp::Algorithm>>>&
algoMapByCollective,
@@ -234,86 +221,64 @@ static std::shared_ptr<mscclpp::Algorithm> algoSelector(
if (algoMapByCollective.find(request.collective) == algoMapByCollective.end()) {
return nullptr;
}
for (const auto& pair : algoMapByCollective.at(request.collective)) {
const auto& algo = pair.second;
if (algo->type() == mscclpp::AlgorithmType::DSL) {
if (matchExecutionPlan(std::static_pointer_cast<mscclpp::DslAlgorithm>(algo), request)) {
if (mscclpp::nccl::matchExecutionPlan(std::static_pointer_cast<mscclpp::DslAlgorithm>(algo), request)) {
return algo;
}
}
}
if (request.nRanksPerNode != request.worldSize) {
// Fallback to nccl/rccl when multi-node
return nullptr;
}
static const bool mscclppDisableChannelCache = mscclpp::env()->disableChannelCache;
// Prepare algorithm selector configuration
static const bool isNvlsSupported = mscclpp::isNvlsSupported();
static const std::pair<int, int> deviceComputeCapability = getDeviceComputeCapability();
size_t messageSize = request.messageSize;
const std::string& collective = request.collective;
bool isCuMemMapAllocated = mscclpp::isCuMemMapAllocated(const_cast<void*>(request.inputBuffer)) &&
mscclpp::isCuMemMapAllocated(request.outputBuffer);
bool useNvlsWithZeroCopy = isNvlsSupported && !mscclppDisableChannelCache && isCuMemMapAllocated;
if (collective == "allgather") {
if (messageSize <= 32 * (1 << 20)) {
return algoMapByCollective.at(collective).at("default_allgather_fullmesh2");
} else {
#if defined(__HIP_PLATFORM_AMD__)
return algoMapByCollective.at(collective).at("default_allgather_fullmesh2");
#else
if (!mscclppNcclDlopenSharedLib) {
return algoMapByCollective.at(collective).at("default_allgather_fullmesh");
}
#endif
}
static const bool ncclSymmetricMemory = mscclpp::env()->ncclSymmetricMemory;
const bool isCuMemMapAllocated = mscclpp::isCuMemMapAllocated(const_cast<void*>(request.inputBuffer)) &&
mscclpp::isCuMemMapAllocated(request.outputBuffer);
cudaStreamCaptureStatus captureStatus = cudaStreamCaptureStatusNone;
CUDACHECK(cudaStreamIsCapturing(request.stream, &captureStatus));
const bool inCaptureMode = (captureStatus == cudaStreamCaptureStatusActive);
mscclpp::nccl::AlgorithmSelectorConfig config{.symmetricMemory = ncclSymmetricMemory,
.nvlsSupported = isNvlsSupported,
.isCuMemMapAllocated = isCuMemMapAllocated,
.inCaptureMode = inCaptureMode,
.computeCapability = deviceComputeCapability,
.ncclDlopenSharedLib = mscclppNcclDlopenSharedLib};
const auto& algoMap = algoMapByCollective.at(request.collective);
// Check if this is a multi-node scenario
if (request.nRanksPerNode != request.worldSize) {
return mscclpp::nccl::selectMultiNodeAlgorithm(algoMap, request, config);
}
if (collective == "allreduce") {
bool useNvls = isNvlsSupported;
bool isFp8 = request.dtype == mscclpp::DataType::FP8_E4M3 || request.dtype == mscclpp::DataType::FP8_E5M2;
#if !defined(__HIP_PLATFORM_AMD__)
if (isFp8 && deviceComputeCapability.first < 10) {
// NVLS does not support FP8 on devices with compute capability < 10
useNvls = false;
}
#endif
if (messageSize <= (1 << 15) && useNvls) {
return algoMapByCollective.at(collective).at("default_allreduce_nvls_packet");
}
if (messageSize <= (1 << 14)) {
return algoMapByCollective.at(collective).at("default_allreduce_allpair_packet");
}
if (messageSize <= (1 << 16) || (messageSize <= (1 << 20) && !useNvlsWithZeroCopy)) {
return algoMapByCollective.at(collective).at("default_allreduce_packet");
}
if (useNvls && useNvlsWithZeroCopy) {
return algoMapByCollective.at(collective).at("default_allreduce_nvls");
}
if (useNvls && messageSize < (1 << 24)) {
return algoMapByCollective.at(collective).at("default_allreduce_nvls_with_copy");
}
if (useNvls && messageSize >= (1 << 24)) {
return algoMapByCollective.at(collective).at("default_allreduce_nvls_with_copy2");
}
#if defined(__HIP_PLATFORM_AMD__)
return algoMapByCollective.at(collective).at("default_allreduce_fullmesh");
#else
if (!mscclppNcclDlopenSharedLib) {
return algoMapByCollective.at(collective).at("default_allreduce_fullmesh");
}
#endif
// Single-node scenarios
if (request.collective == "allgather") {
return mscclpp::nccl::selectSingleNodeAllgather(algoMap, request, config);
}
INFO(MSCCLPP_NCCL, "Failed to get algo from customized kernel, fallback to nccl/rccl");
if (request.collective == "allreduce") {
return mscclpp::nccl::selectSingleNodeAllreduce(algoMap, request, config);
}
INFO(MSCCLPP_NCCL, "No suitable algorithm found for collective '%s', fallback to nccl/rccl",
request.collective.c_str());
return nullptr;
}
NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) {
INFO(MSCCLPP_NCCL, "Initializing NCCL communicator for rank %d, world_size=%d", rank, nranks);
if (comm == nullptr) {
WARN("comm is nullptr");
WARN(MSCCLPP_NCCL, "comm is nullptr");
return ncclInvalidArgument;
}
if (nranks < 0 || rank < 0 || rank >= nranks) {
WARN("nranks is %d, rank is %d", nranks, rank);
WARN(MSCCLPP_NCCL, "nranks is %d, rank is %d", nranks, rank);
return ncclInvalidArgument;
}
std::shared_ptr<mscclpp::TcpBootstrap> bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, nranks);
@@ -327,12 +292,15 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
commPtr->scratchBuffer_ = mscclpp::GpuBuffer<char>(commPtr->scratchBufferSize_).memory();
commPtr->executor = std::make_shared<mscclpp::Executor>(mscclppComm, commPtr->scratchBuffer_);
auto [flagBuffer, flagBufferSize] = mscclpp::getDefaultFlagBuffer();
commPtr->nRanksPerNode = mscclppComm->bootstrap()->getNranksPerNode();
commPtr->worldSize = mscclppComm->bootstrap()->getNranks();
auto algoBuilder = mscclpp::collective::AlgorithmCollectionBuilder::getInstance();
algoBuilder->setFallbackAlgorithmSelector(algoSelector);
commPtr->algorithmCollection = algoBuilder->buildDefaultAlgorithms(
reinterpret_cast<uintptr_t>(commPtr->scratchBuffer_.get()), commPtr->scratchBufferSize_, rank);
reinterpret_cast<uintptr_t>(commPtr->scratchBuffer_.get()), commPtr->scratchBufferSize_,
reinterpret_cast<uintptr_t>(flagBuffer.get()), flagBufferSize, rank);
// Extend with user-defined algorithms
commPtr->algorithmCollection.extend(algoBuilder->build());
@@ -346,7 +314,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
const std::string ncclLibPath = mscclpp::env()->ncclSharedLibPath;
if (!ncclLibPath.empty() && !mscclppNcclDlopenSharedLib) {
if (!tryLoadNcclSharedLib()) {
WARN("Failed to load the shared library for nccl/rccl");
WARN(MSCCLPP_NCCL, "Failed to load the shared library for nccl/rccl");
return ncclInternalError;
}
}
@@ -361,7 +329,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
commPtr->mscclppNcclComm = new ncclComm_t();
if (commPtr->mscclppNcclComm == nullptr) {
WARN("Failed to allocate memory for mscclppNcclComm");
WARN(MSCCLPP_NCCL, "Failed to allocate memory for mscclppNcclComm");
return ncclInternalError;
}
mscclppNcclOps.CommInitRank(reinterpret_cast<ncclComm_t*>(commPtr->mscclppNcclComm), nranks, mscclppNcclUniqueId,
@@ -378,7 +346,7 @@ NCCL_API ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int*) {
return ncclCommInitRank(comm, ndev, Id, 0);
}
// TODO: implement this function
WARN("ncclCommInitAll is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommInitAll is currently unavailable");
return ncclInternalError;
}
@@ -389,7 +357,7 @@ NCCL_API ncclResult_t ncclCommFinalize(ncclComm_t comm) {
NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == nullptr) {
WARN("comm is nullptr");
WARN(MSCCLPP_NCCL, "comm is nullptr");
return ncclInvalidArgument;
}
#if defined(ENABLE_NPKIT)
@@ -447,7 +415,7 @@ NCCL_API ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclCom
}
ncclResult_t ncclCommInitRankScalable(ncclComm_t*, int, int, int, ncclUniqueId*, ncclConfig_t*) {
WARN("ncclCommInitRankScalable is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommInitRankScalable is currently unavailable");
return ncclInternalError;
}
@@ -481,7 +449,7 @@ NCCL_API const char* ncclGetLastError(ncclComm_t) {
NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t, ncclResult_t* asyncError) {
if (asyncError == nullptr) {
WARN("asyncError is nullptr");
WARN(MSCCLPP_NCCL, "asyncError is nullptr");
return ncclInvalidArgument;
}
*asyncError = ncclSuccess;
@@ -490,7 +458,7 @@ NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t, ncclResult_t* asyncError
NCCL_API ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
if (comm == nullptr || count == nullptr) {
WARN("comm is nullptr or count is nullptr");
WARN(MSCCLPP_NCCL, "comm is nullptr or count is nullptr");
return ncclInvalidArgument;
}
*count = comm->comm->bootstrap()->getNranks();
@@ -499,7 +467,7 @@ NCCL_API ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
NCCL_API ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) {
if (comm == nullptr || device == nullptr) {
WARN("comm is nullptr or device is nullptr");
WARN(MSCCLPP_NCCL, "comm is nullptr or device is nullptr");
return ncclInvalidArgument;
}
*device = comm->comm->bootstrap()->getRank();
@@ -508,7 +476,7 @@ NCCL_API ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) {
NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
if (comm == nullptr || rank == nullptr) {
WARN("comm is nullptr or rank is nullptr");
WARN(MSCCLPP_NCCL, "comm is nullptr or rank is nullptr");
return ncclInvalidArgument;
}
@@ -521,24 +489,24 @@ NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
}
NCCL_API ncclResult_t ncclCommWindowRegister(ncclComm_t, void*, size_t, ncclWindow_t*, int) {
WARN("ncclCommWindowRegister is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommWindowRegister is currently unavailable");
return ncclInternalError;
}
NCCL_API ncclResult_t ncclCommWindowDeregister(ncclComm_t, ncclWindow_t) {
WARN("ncclCommWindowDeregister is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommWindowDeregister is currently unavailable");
return ncclInternalError;
}
NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t*, void*, ncclDataType_t, ncclScalarResidence_t, ncclComm_t) {
// TODO: implement this function
WARN("ncclRedOpCreatePreMulSum is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclRedOpCreatePreMulSum is currently unavailable");
return ncclInternalError;
}
NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t, ncclComm_t) {
// TODO: implement this function
WARN("ncclRedOpDestroy is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclRedOpDestroy is currently unavailable");
return ncclInternalError;
}
@@ -549,7 +517,7 @@ NCCL_API ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t co
return mscclppNcclOps.Reduce(sendbuff, recvbuff, count, datatype, op, root,
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
WARN("ncclReduce is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclReduce is currently unavailable");
return ncclInternalError;
}
@@ -569,9 +537,9 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
}
int rank = comm->comm->bootstrap()->getRank();
if ((sendbuff == nullptr && root == rank) || recvbuff == nullptr || bytes == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
WARN(MSCCLPP_NCCL,
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
return ncclInvalidArgument;
}
@@ -587,12 +555,15 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
mscclpp::DataType dtype = ncclDataTypeToMscclpp(datatype);
static std::unordered_map<std::string, std::vector<uint64_t>> hints{{"root", {static_cast<uint64_t>(root)}}};
hints["root"][0] = static_cast<uint64_t>(root);
const bool symmetricMemory = mscclpp::env()->ncclSymmetricMemory;
mscclpp::CollectiveRequest request = {.worldSize = comm->worldSize,
.nRanksPerNode = comm->nRanksPerNode,
.rank = rank,
.inputBuffer = sendbuff,
.outputBuffer = recvbuff,
.messageSize = bytes,
.stream = stream,
.collective = "broadcast",
.dtype = dtype,
.hints = hints};
@@ -600,7 +571,8 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
if (algo != nullptr) {
std::unordered_map<std::string, uintptr_t> extras{{"root", reinterpret_cast<uintptr_t>(&root)}};
return static_cast<ncclResult_t>(algo->execute(comm->comm, sendbuff, recvbuff, bytes, bytes, dtype,
mscclpp::ReduceOp::NOP, stream, comm->executor, 0, 0, extras));
mscclpp::ReduceOp::NOP, stream, comm->executor, 0, 0,
symmetricMemory, extras));
}
if (mscclppNcclDlopenSharedLib == true) {
@@ -608,7 +580,7 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
WARN("No FallBack implementation for broadcast");
WARN(MSCCLPP_NCCL, "No FallBack implementation for broadcast");
return ncclInvalidUsage;
}
@@ -623,9 +595,9 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
}
// Checking if the parameters are valids
if (sendbuff == nullptr || recvbuff == nullptr || count == 0 || ncclTypeSize(datatype) == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, count is 0, "
"datatype is invalid, or comm is nullptr.");
WARN(MSCCLPP_NCCL,
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, count is 0, "
"datatype is invalid, or comm is nullptr.");
return ncclInvalidArgument;
}
// Declarating variables
@@ -639,12 +611,14 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
mscclpp::DataType dtype = ncclDataTypeToMscclpp(datatype);
const bool symmetricMemory = mscclpp::env()->ncclSymmetricMemory;
mscclpp::CollectiveRequest request = {.worldSize = comm->worldSize,
.nRanksPerNode = comm->nRanksPerNode,
.rank = rank,
.inputBuffer = sendbuff,
.outputBuffer = recvbuff,
.messageSize = bytes,
.stream = stream,
.collective = "allreduce",
.dtype = dtype,
.hints = {}};
@@ -652,7 +626,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
auto algo = comm->algorithmCollection.selectAlgorithm(request);
if (algo != nullptr) {
return static_cast<ncclResult_t>(algo->execute(comm->comm, sendbuff, recvbuff, bytes, bytes, dtype,
ncclRedOpToMscclpp(reductionOperation), stream, comm->executor));
ncclRedOpToMscclpp(reductionOperation), stream, comm->executor, 0, 0,
symmetricMemory));
}
if (mscclppNcclDlopenSharedLib == true) {
@@ -660,7 +635,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
WARN("No FallBack implementation for AllReduce");
WARN(MSCCLPP_NCCL, "No FallBack implementation for AllReduce");
return ncclInvalidUsage;
}
@@ -675,9 +650,9 @@ NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, si
}
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
WARN(MSCCLPP_NCCL,
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
return ncclInvalidArgument;
}
@@ -693,19 +668,22 @@ NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, si
int rank = comm->comm->bootstrap()->getRank();
int nRank = comm->comm->bootstrap()->getNranks();
mscclpp::DataType dtype = ncclDataTypeToMscclpp(datatype);
const bool symmetricMemory = mscclpp::env()->ncclSymmetricMemory;
mscclpp::CollectiveRequest request = {.worldSize = comm->worldSize,
.nRanksPerNode = comm->nRanksPerNode,
.rank = rank,
.inputBuffer = sendbuff,
.outputBuffer = recvbuff,
.messageSize = bytes * nRank,
.stream = stream,
.collective = "reducescatter",
.dtype = dtype,
.hints = {}};
auto algo = comm->algorithmCollection.selectAlgorithm(request);
if (algo != nullptr) {
return static_cast<ncclResult_t>(algo->execute(comm->comm, sendbuff, recvbuff, bytes * nRank, bytes, dtype,
ncclRedOpToMscclpp(op), stream, comm->executor));
ncclRedOpToMscclpp(op), stream, comm->executor, 0, 0,
symmetricMemory));
}
if (mscclppNcclDlopenSharedLib == true) {
@@ -713,7 +691,7 @@ NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, si
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
WARN("No FallBack implementation for ReduceScatter");
WARN(MSCCLPP_NCCL, "No FallBack implementation for ReduceScatter");
return ncclInternalError;
}
@@ -727,9 +705,9 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
return ncclSuccess;
}
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
WARN(MSCCLPP_NCCL,
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
return ncclInvalidArgument;
}
@@ -745,12 +723,14 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
}
mscclpp::DataType dtype = ncclDataTypeToMscclpp(datatype);
const bool symmetricMemory = mscclpp::env()->ncclSymmetricMemory;
mscclpp::CollectiveRequest request = {.worldSize = comm->worldSize,
.nRanksPerNode = comm->nRanksPerNode,
.rank = rank,
.inputBuffer = sendbuff,
.outputBuffer = recvbuff,
.messageSize = bytes,
.stream = stream,
.collective = "allgather",
.dtype = dtype,
.hints = {}};
@@ -758,7 +738,8 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
auto algo = comm->algorithmCollection.selectAlgorithm(request);
if (algo != nullptr) {
return static_cast<ncclResult_t>(algo->execute(comm->comm, sendbuff, recvbuff, bytes, bytes * nRank, dtype,
mscclpp::ReduceOp::NOP, stream, comm->executor));
mscclpp::ReduceOp::NOP, stream, comm->executor, 0, 0,
symmetricMemory));
}
if (mscclppNcclDlopenSharedLib == true) {
@@ -766,7 +747,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}
WARN("No FallBack implementation for AllGather");
WARN(MSCCLPP_NCCL, "No FallBack implementation for AllGather");
return ncclInvalidUsage;
}
@@ -776,7 +757,7 @@ NCCL_API ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_
return mscclppNcclOps.Send(sendbuff, count, datatype, peer, *reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm),
stream);
}
WARN("ncclSend is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclSend is currently unavailable");
return ncclInternalError;
}
@@ -786,7 +767,7 @@ NCCL_API ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t data
return mscclppNcclOps.Recv(recvbuff, count, datatype, peer, *reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm),
stream);
}
WARN("ncclRecv is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclRecv is currently unavailable");
return ncclInternalError;
}
@@ -800,7 +781,7 @@ NCCL_API ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t
return ncclSuccess;
}
// TODO: implement this function
WARN("ncclAllToAll is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclAllToAll is currently unavailable");
return ncclInternalError;
}
@@ -815,7 +796,7 @@ NCCL_API ncclResult_t ncclAllToAllv(const void* sendbuff, [[maybe_unused]] const
cudaMemcpyDeviceToDevice, stream));
return ncclSuccess;
}
WARN("ncclAllToAllv is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclAllToAllv is currently unavailable");
return ncclInternalError;
}
@@ -824,7 +805,7 @@ NCCL_API ncclResult_t ncclGroupStart() {
if (mscclppNcclDlopenSharedLib == true) {
return mscclppNcclOps.GroupStart();
}
WARN("ncclGroupStart is currently unavailable, return success");
WARN(MSCCLPP_NCCL, "ncclGroupStart is currently unavailable, return success");
return ncclSuccess;
}
@@ -832,56 +813,56 @@ NCCL_API ncclResult_t ncclGroupEnd() {
if (mscclppNcclDlopenSharedLib == true) {
return mscclppNcclOps.GroupEnd();
}
WARN("ncclGroupEnd is currently unavailable, return success");
WARN(MSCCLPP_NCCL, "ncclGroupEnd is currently unavailable, return success");
return ncclSuccess;
}
NCCL_API ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t*) {
// TODO: implement this function
WARN("ncclGroupSimulateEnd is not implemented");
WARN(MSCCLPP_NCCL, "ncclGroupSimulateEnd is not implemented");
return ncclInternalError;
}
NCCL_API ncclResult_t ncclCommRegister(const ncclComm_t, void*, size_t, void**) {
// TODO: Implementation
WARN("ncclCommRegister is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommRegister is currently unavailable");
return ncclInternalError;
}
NCCL_API ncclResult_t ncclCommDeregister(const ncclComm_t, void*) {
// TODO: Implementation
WARN("ncclCommDeregister is currently unavailable");
WARN(MSCCLPP_NCCL, "ncclCommDeregister is currently unavailable");
return ncclInternalError;
}
ncclResult_t ncclMemAlloc(void** ptr, size_t size) {
if (ptr == nullptr || size == 0) {
WARN("ptr is nullptr or size is 0");
WARN(MSCCLPP_NCCL, "ptr is nullptr or size is 0");
return ncclInvalidArgument;
}
std::shared_ptr<char> sharedPtr;
try {
sharedPtr = mscclpp::GpuBuffer(size).memory();
if (sharedPtr == nullptr) {
WARN("Failed to allocate memory via ncclMemAlloc");
WARN(MSCCLPP_NCCL, "Failed to allocate memory via ncclMemAlloc");
return ncclSystemError;
}
} catch (const mscclpp::Error& e) {
if (e.getErrorCode() == mscclpp::ErrorCode::InvalidUsage) {
WARN("Invalid usage: %s", e.what());
WARN(MSCCLPP_NCCL, "Invalid usage: %s", e.what());
return ncclInvalidUsage;
} else {
WARN("Internal error: %s", e.what());
WARN(MSCCLPP_NCCL, "Internal error: %s", e.what());
return ncclInternalError;
}
} catch (const mscclpp::CudaError& e) {
WARN("Cuda error: %s", e.what());
WARN(MSCCLPP_NCCL, "Cuda error: %s", e.what());
return ncclUnhandledCudaError;
} catch (const mscclpp::CuError& e) {
WARN("Cu error: %s", e.what());
WARN(MSCCLPP_NCCL, "Cu error: %s", e.what());
return ncclUnhandledCudaError;
} catch (const mscclpp::BaseError& e) {
WARN("Base error: %s", e.what());
WARN(MSCCLPP_NCCL, "Base error: %s", e.what());
return ncclInternalError;
}
ptrMap[sharedPtr.get()] = sharedPtr;
@@ -899,6 +880,6 @@ ncclResult_t ncclMemFree(void* ptr) {
}
// Pointer not found
WARN("Pointer not found");
WARN(MSCCLPP_NCCL, "Pointer not found");
return ncclInvalidUsage;
}

View File

@@ -1,8 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
# run with:
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so MSCCLPP_DISABLE_CHANNEL_CACHE=true torchrun --nproc_per_node=8 ./allreduce_temp_buff.py
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so MSCCLPP_NCCL_SYMMETRIC_MEMORY=false torchrun --nproc_per_node=8 ./allreduce_temp_buff.py
import os
import torch

View File

@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so MSCCLPP_DISABLE_CHANNEL_CACHE=true torchrun --nnodes=1 --nproc_per_node=8 memory_report.py
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so MSCCLPP_NCCL_SYMMETRIC_MEMORY=false torchrun --nnodes=1 --nproc_per_node=8 memory_report.py
import os, sys
import torch
import torch.distributed as dist