Merge branch 'main' into chhwang/fix-ib-no-atomic

This commit is contained in:
Changho Hwang
2026-02-23 19:14:13 -08:00
committed by GitHub
20 changed files with 292 additions and 46 deletions

View File

@@ -44,7 +44,7 @@ jobs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -19,7 +19,7 @@ steps:
targetType: inline
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -27,7 +27,7 @@ steps:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'

View File

@@ -16,7 +16,7 @@ steps:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -63,7 +63,7 @@ steps:
set -e; \
cd /root/mscclpp; \
mkdir -p build && cd build; \
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
make -j"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -20,9 +20,9 @@ steps:
script: |
mkdir build && cd build
if [ "${{ parameters.platform }}" == "rocm" ]; then
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
else
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
fi
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -47,7 +47,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
# Options
option(MSCCLPP_ENABLE_TRACE "Enable tracing" OFF)
option(MSCCLPP_BUILD_TESTS "Build tests" ON)
option(MSCCLPP_BUILD_TESTS "Build tests" OFF)
option(MSCCLPP_BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(MSCCLPP_BUILD_EXT_NCCL "Build NCCL interfaces" ON)
option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON)
@@ -56,6 +56,7 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON)
set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
if(MSCCLPP_BYPASS_GPU_CHECK)
@@ -185,7 +186,10 @@ if(MSCCLPP_USE_GDRCOPY)
endif()
include(FetchContent)
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
FetchContent_Declare(json
GIT_REPOSITORY https://github.com/nlohmann/json.git
GIT_TAG v3.12.0
)
FetchContent_MakeAvailable(json)
if("${INSTALL_PREFIX}" STREQUAL "")

View File

@@ -0,0 +1,15 @@
# Build the bidir_switch_channel example with nvcc, linking against mscclpp.
CUDA_HOME ?= /usr/local/cuda
COMPILER := $(CUDA_HOME)/bin/nvcc
ARCH_FLAG := -arch=native
TARGET = bidir_switch_channel
SRC = bidir_switch_channel.cu

# `all` and `clean` do not produce files with those names; mark them phony
# so a stray file named `all` or `clean` cannot mask the targets.
.PHONY: all clean

all: $(TARGET)

$(TARGET): $(SRC)
	$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp

clean:
	rm -f $(TARGET)

View File

@@ -0,0 +1,177 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <sys/wait.h>
#include <unistd.h>
#include <functional>
#include <iostream>
#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/switch_channel.hpp>
#include <mscclpp/switch_channel_device.hpp>
#include <sstream>
#define PORT_NUMBER "50505"
// Process-safe logging helper: the whole message (plus trailing newline) is
// assembled in a local buffer first and emitted with a single stream write,
// so output from concurrently running ranks does not interleave mid-line.
template <typename... Args>
void log(Args &&...pieces) {
  std::ostringstream line;
  (line << ... << pieces);
  line << '\n';
  std::cout << line.str();
}
// Fork a child process that runs `func` and then exits with status 0.
// Returns the child's pid in the parent, or -1 if fork() failed.
int spawn_process(std::function<void()> func) {
  const pid_t child = fork();
  if (child > 0) return child;  // parent: hand back the child's pid
  if (child == 0) {
    // Child: run the work, then terminate without returning to the caller.
    func();
    exit(0);
  }
  return -1;  // fork() failed
}
// Block until process `pid` terminates. Returns its exit code when it
// exited normally; returns -1 if waitpid() failed or the process was
// terminated by a signal.
int wait_process(int pid) {
  int status = 0;
  const bool waited = (waitpid(pid, &status, 0) >= 0);
  if (waited && WIFEXITED(status)) {
    return WEXITSTATUS(status);
  }
  return -1;
}
// Device handle for the bound switch channel; filled in from the host via
// cudaMemcpyToSymbol in worker() before any kernel launch.
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan;
// Device-wide syncer (declared for device code; not referenced by this kernel).
__device__ mscclpp::DeviceSyncer devSyncer;
// For each element in this rank's half of the buffer: reduce the value across
// ranks through the switch channel, then broadcast the result back in place.
// The half-ranges split the work between the two ranks; each thread walks its
// range with a grid-stride loop (stride = total threads in the grid).
// @param rank        this process's rank (0 or 1) — selects which half to cover.
// @param numElements total element count across both halves.
__global__ void kernelSwitchReduce(int rank, int numElements) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
// rank 0 performs on first half of data and rank 1 on second half
int min = rank * (numElements / 2);
int max = (rank + 1) * (numElements / 2);
for (int i = tid + min; i < max; i += stride) {
// NOTE(review): reduce() presumably pulls element i from all ranks through
// the switch and broadcast() writes the reduced value to every rank — confirm
// against SwitchChannelDeviceHandle's documentation.
auto val = gConstSwitchChan.reduce<mscclpp::f32x1>(i);
gConstSwitchChan.broadcast(i, val);
}
}
// One test process: bootstraps a 2-rank communicator over TCP, binds a GPU
// buffer to an NVLS switch-channel connection, and times kernelSwitchReduce
// over several element counts. Rank 0 measures and logs per-iteration latency
// and bandwidth; both ranks launch the kernels.
// @param myRank rank of this process (0 or 1; nRanks is fixed at 2 below).
// @param gpuId  CUDA device ordinal to run on.
// @param ipPort bootstrap address string, e.g. "lo:127.0.0.1:50505".
void worker(int myRank, int gpuId, const std::string &ipPort) {
MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
const int nRanks = 2;
const int iter = 1000;  // kernel launches per timed measurement
const size_t bufferBytes = 128 * 1024 * 1024;
log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ...");
// Build a connection and a semaphore
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
bootstrap->initialize(ipPort);
std::shared_ptr<mscclpp::Communicator> comm = std::make_shared<mscclpp::Communicator>(bootstrap);
// All ranks (0..nRanks-1) participate in the NVLS collective connection.
std::vector<int> ranks;
ranks.reserve(nRanks);
for (int i = 0; i < nRanks; i++) ranks.push_back(i);
// NOTE(review): GpuBuffer<T> constructors typically take an element count;
// passing bufferBytes here would allocate bufferBytes floats (4x the byte
// size bound below) — confirm against the GpuBuffer API.
auto buffer = mscclpp::GpuBuffer<float>(bufferBytes);
auto nvlsConnection = mscclpp::connectNvlsCollective(comm, ranks, bufferBytes);
auto switchChannel = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer.data()), bufferBytes);
// Publish the channel's device handle into the kernel's __constant__ symbol.
auto deviceHandle = switchChannel.deviceHandle();
MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gConstSwitchChan, &deviceHandle, sizeof(deviceHandle)));
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
// Call the kernel in a loop for perf evaluation
for (size_t numElements : {1024, 1024 * 1024, 32 * 1024 * 1024}) {
// Only rank 0 creates timing events and reports results.
cudaEvent_t start, end;
if (myRank == 0) {
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
}
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
bootstrap->barrier();  // align both ranks before timing starts
if (myRank == 0) {
MSCCLPP_CUDATHROW(cudaEventRecord(start, 0));
}
for (int i = 0; i < iter; ++i) {
kernelSwitchReduce<<<256, 1024>>>(myRank, numElements);
}
MSCCLPP_CUDATHROW(cudaGetLastError());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
// NOTE(review): presumably the same bootstrap object as `bootstrap` above —
// verify; also, `end` is recorded after this barrier, so the measurement
// includes the other rank's completion plus barrier overhead.
comm->bootstrap()->barrier();
if (myRank == 0) {
MSCCLPP_CUDATHROW(cudaEventRecord(end, 0));
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
float elapsedTime;
float elapsedTimePerIter;
float gbps;
MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end));
elapsedTimePerIter = elapsedTime / iter;  // cudaEventElapsedTime returns ms
float dataSize = numElements * 4;  // bytes per iteration (4-byte floats)
gbps = dataSize / elapsedTimePerIter * 1e-6f;  // bytes/ms * 1e-6 = GB/s
log("Rank ", myRank, " (GPU ", gpuId, "): bytes ", dataSize, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ",
gbps, " GB/s");
}
// NOTE(review): start/end are never destroyed — one event pair leaks per
// loop iteration on rank 0; consider cudaEventDestroy before the next size.
}
}
// Entry point. Two modes:
//   no arguments:              intra-node — fork rank 0 (GPU 0) and rank 1 (GPU 1),
//                              bootstrap over loopback, and report their status;
//   <ip_port> <rank> <gpu_id>: inter-node — run a single worker with the given
//                              bootstrap address, rank, and GPU.
// Returns 0 on success, -1 on any failure.
int main(int argc, char **argv) {
  if (argc == 1) {
    int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); });
    int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); });
    if (pid0 < 0 || pid1 < 0) {
      log("Failed to spawn processes.");
      return -1;
    }
    int status0 = wait_process(pid0);
    int status1 = wait_process(pid1);
    if (status0 < 0 || status1 < 0) {
      log("Failed to wait for processes.");
      return -1;
    }
    if (status0 != 0 || status1 != 0) {
      log("One of the processes failed.");
      return -1;
    }
    log("Succeed!");
    return 0;
  } else if (argc == 4) {
    std::string ipPort = argv[1];
    int rank, gpuId;
    try {
      rank = std::stoi(argv[2]);
      gpuId = std::stoi(argv[3]);
    } catch (const std::exception &) {
      log("Error: rank and gpu_id must be valid integers.");
      return -1;
    }
    // BUGFIX: the previous bound (rank > 2) accepted rank == 2, but only
    // ranks 0 and 1 exist (nRanks == 2) — matching the message below.
    if (rank < 0 || rank > 1 || gpuId < 0) {
      log("Error: rank must be between 0 and 1 and gpu_id must be non-negative.");
      return -1;
    }
    worker(rank, gpuId, ipPort);
    log("Rank ", rank, ": Succeed!");
    return 0;
  } else {
    std::cerr << "Usage:\n"
              << " " << argv[0] << " Run in intra-node mode\n"
              << " " << argv[0] << " <ip_port> <rank> <gpu_id> Run in inter-node mode\n";
    return -1;
  }
}

View File

@@ -366,7 +366,7 @@ class AlgorithmCollection {
/// Get a default GPU flag buffer (allocated once and reused).
/// @return A pair of (shared_ptr to the flag buffer, size in bytes).
std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer();
std::pair<std::shared_ptr<void>, size_t> getFlagBuffer();
} // namespace mscclpp

View File

@@ -4,6 +4,10 @@
add_subdirectory(csrc)
add_subdirectory(test)
target_compile_definitions(mscclpp_py PRIVATE
$<$<BOOL:${MSCCLPP_DISABLE_NB_LEAK_WARNINGS}>:MSCCLPP_DISABLE_NB_LEAK_WARNINGS>
)
add_custom_target(pytest_lib_copy ALL
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_mscclpp.*.so
@@ -12,4 +16,4 @@ add_custom_target(pytest_lib_copy ALL
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_ext.*.so
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
DEPENDS mscclpp_py mscclpp_py_test
)
)

View File

@@ -116,10 +116,15 @@ void register_algorithm(nb::module_& m) {
.def("buffer_mode", &CollectiveRequest::bufferMode);
m.def(
"cpp_get_default_flag_buffer",
"cpp_get_flag_buffer",
[]() {
auto [buffer, size] = getDefaultFlagBuffer();
return std::make_pair(reinterpret_cast<uintptr_t>(buffer.get()), size);
auto [buffer, size] = getFlagBuffer();
uintptr_t ptr = reinterpret_cast<uintptr_t>(buffer.get());
// Transfer shared_ptr ownership into a capsule so Python's GC manages the lifetime.
auto prevent = std::make_unique<std::shared_ptr<void>>(std::move(buffer));
nb::capsule owner(prevent.get(), [](void* p) noexcept { delete static_cast<std::shared_ptr<void>*>(p); });
prevent.release(); // capsule now owns the pointer
return nb::make_tuple(ptr, size, owner);
},
"Get the default flag buffer. Returns a tuple of (buffer_ptr, buffer_size).");
"Get the default flag buffer. Returns a tuple of (buffer_ptr, buffer_size, owner).");
}

View File

@@ -307,6 +307,9 @@ void register_core(nb::module_& m) {
}
NB_MODULE(_mscclpp, m) {
#ifdef MSCCLPP_DISABLE_NB_LEAK_WARNINGS
nb::set_leak_warnings(false);
#endif
register_env(m);
register_error(m);
register_port_channel(m);
@@ -324,4 +327,4 @@ NB_MODULE(_mscclpp, m) {
// ext
register_algorithm_collection_builder(m);
}
}

View File

@@ -19,7 +19,7 @@ from mscclpp._mscclpp import (
CppReduceOp,
CppAlgorithmBuilder,
CppAlgorithmCollection,
cpp_get_default_flag_buffer,
cpp_get_flag_buffer,
)
__all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"]
@@ -241,15 +241,22 @@ class AlgorithmCollection:
self._algorithms.append(algorithm)
def get_default_flag_buffer() -> cp.ndarray:
_flag_buffer_cache = None
def get_flag_buffer() -> cp.ndarray:
"""Get the default flag buffer for algorithm selection.
This buffer is used internally by default algorithms to store selection flags.
It is allocated as a shared GPU buffer and can be accessed from Python.
The result is cached so all callers share the same buffer.
Returns:
A CuPy array representing the flag buffer on the GPU.
"""
buffer_ptr, buffer_size = cpp_get_default_flag_buffer()
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, None), 0)
return cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr)
global _flag_buffer_cache
if _flag_buffer_cache is None:
buffer_ptr, buffer_size, owner = cpp_get_flag_buffer()
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, owner), 0)
_flag_buffer_cache = cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr)
return _flag_buffer_cache

View File

@@ -19,8 +19,8 @@ from mscclpp._mscclpp import (
CppTransport,
CppTransportFlags,
)
import mpi4py
import numpy as np
import pickle
from mscclpp.utils import is_torch_tensor
@@ -29,20 +29,40 @@ __all__ = ["CommGroup"]
class CommGroup:
def __init__(
self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None
self,
mpi_comm: "mpi4py.MPI.Comm" = None,
torch_group: "dist.ProcessGroup" = None,
interfaceIpPortTrio: str = "",
rank: int = None,
size: int = None,
):
if interfaceIpPortTrio == "":
self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None):
uniq_id = None
if mpi_comm.rank == 0:
# similar to NCCL's unique id
rank, size = (
(mpi_comm.Get_rank(), mpi_comm.Get_size())
if mpi_comm is not None
else (torch_group.rank(), torch_group.size())
)
self.bootstrap = CppTcpBootstrap.create(rank, size)
if rank == 0:
uniq_id = self.bootstrap.create_unique_id()
uniq_id_global = mpi_comm.bcast(uniq_id, 0)
if mpi_comm is not None:
import mpi4py
uniq_id_global = mpi_comm.bcast(uniq_id, 0)
else:
import torch
import torch.distributed as dist
if rank == 0:
uniq_id_global = uniq_id
pickled_data = pickle.dumps(uniq_id)
data_tensor = torch.frombuffer(bytearray(pickled_data), dtype=torch.uint8).clone()
else:
data_tensor = torch.zeros(256, dtype=torch.uint8)
dist.broadcast(data_tensor, src=0, group=torch_group)
uniq_id_global = pickle.loads(data_tensor.numpy().tobytes())
self.bootstrap.initialize(uniq_id_global)
elif mpi_comm:
# use this instead
self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
self.bootstrap.initialize(interfaceIpPortTrio)
elif not interfaceIpPortTrio == "":
assert rank >= 0 and size >= 1
self.bootstrap = CppTcpBootstrap.create(rank, size)

View File

@@ -3,7 +3,7 @@
from __future__ import annotations
from typing import Union
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_default_flag_buffer
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_flag_buffer
import atexit
from mscclpp._mscclpp import CppAlgorithmCollectionBuilder
@@ -58,7 +58,7 @@ class AlgorithmCollectionBuilder:
rank: int,
) -> AlgorithmCollection:
if self._flag_buffer is None:
self._flag_buffer = get_default_flag_buffer()
self._flag_buffer = get_flag_buffer()
native_collection = self._builder.build_default_algorithms(
int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank
)

View File

@@ -63,10 +63,13 @@ class MyProxyService {
};
NB_MODULE(_ext, m) {
#ifdef MSCCLPP_DISABLE_NB_LEAK_WARNINGS
nb::set_leak_warnings(false);
#endif
nb::class_<MyProxyService>(m, "MyProxyService")
.def(nb::init<int, int, int, nb::list, nb::list>(), nb::arg("rank"), nb::arg("nranks"), nb::arg("data_size"),
nb::arg("reg_mem_list"), nb::arg("sem_list"))
.def("fifo_device_handle", &MyProxyService::fifoDeviceHandle)
.def("start", &MyProxyService::start)
.def("stop", &MyProxyService::stop);
}
}

View File

@@ -199,18 +199,23 @@ std::shared_ptr<Algorithm> DslAlgorithm::build() { return shared_from_this(); }
// TODO: implement this
void DslAlgorithm::reset() {}
static std::weak_ptr<uint32_t> gDefaultFlagBuffer;
static uint32_t* gDefaultFlagBuffer = nullptr;
static std::weak_ptr<void> gDefaultFlagBufferWeak;
static size_t gDefaultFlagCount = 128;
std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer() {
std::shared_ptr<uint32_t> flagBuffer = gDefaultFlagBuffer.lock();
if (!flagBuffer) {
flagBuffer = mscclpp::detail::gpuCallocShared<uint32_t>(gDefaultFlagCount);
std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);
mscclpp::gpuMemcpy(flagBuffer.get(), initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);
gDefaultFlagBuffer = flagBuffer;
std::pair<std::shared_ptr<void>, size_t> getFlagBuffer() {
auto ptr = gDefaultFlagBufferWeak.lock();
if (!ptr) {
if (!gDefaultFlagBuffer) {
// Intentionally never freed — CUDA driver reclaims GPU memory at process exit.
gDefaultFlagBuffer = static_cast<uint32_t*>(mscclpp::detail::gpuCalloc(gDefaultFlagCount * sizeof(uint32_t)));
std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);
mscclpp::gpuMemcpy(gDefaultFlagBuffer, initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);
}
ptr = std::shared_ptr<void>(gDefaultFlagBuffer, [](void*) {});
gDefaultFlagBufferWeak = ptr;
}
return {flagBuffer, gDefaultFlagCount * sizeof(uint32_t)};
return {ptr, gDefaultFlagCount * sizeof(uint32_t)};
}
} // namespace mscclpp

View File

@@ -294,7 +294,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
commPtr->scratchBuffer_ = mscclpp::GpuBuffer<char>(commPtr->scratchBufferSize_).memory();
commPtr->executor = std::make_shared<mscclpp::Executor>(mscclppComm, commPtr->scratchBuffer_);
auto [buffer, size] = mscclpp::getDefaultFlagBuffer();
auto [buffer, size] = mscclpp::getFlagBuffer();
commPtr->flagBuffer_ = buffer;
commPtr->flagBufferSize_ = size;

View File

@@ -1,7 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz)
FetchContent_Declare(json
GIT_REPOSITORY https://github.com/nlohmann/json.git
GIT_TAG v3.12.0
)
FetchContent_MakeAvailable(json)
function(add_mscclpp_test_executable name sources)