mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
- Removes the GTest dependency, replacing it with a minimal custom framework (`test/framework.*`) that covers only what the tests actually use — a unified `TEST()` macro with SFINAE-based fixture auto-detection, `EXPECT_*`/`ASSERT_*` assertions, environments, and setup/teardown. - `--exclude-perf-tests` flag and substring-based negative filtering - `MSCCLPP_ENABLE_COVERAGE` CMake option with gcov/lcov; CI uploads to Codecov - Merges standalone `test/perf/` into main test targets - Refactors Azure pipelines to reduce redundancies & make more readable --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
301 lines
10 KiB
Plaintext
301 lines
10 KiB
Plaintext
// Copyright (c) Microsoft Corporation.
|
|
// Licensed under the MIT License.
|
|
|
|
#include <mpi.h>
|
|
|
|
#include <mscclpp/gpu_utils.hpp>
|
|
#include <mscclpp/semaphore.hpp>
|
|
|
|
#include "mp_unit_tests.hpp"
|
|
|
|
void CommunicatorTestBase::SetUp() {
|
|
MultiProcessTest::SetUp();
|
|
|
|
if (numRanksToUse == -1) {
|
|
numRanksToUse = gEnv->worldSize;
|
|
}
|
|
ASSERT_LE(numRanksToUse, gEnv->worldSize);
|
|
|
|
ibTransport = ibIdToTransport(rankToLocalRank(gEnv->rank));
|
|
MSCCLPP_CUDATHROW(cudaSetDevice(rankToLocalRank(gEnv->rank)));
|
|
|
|
std::shared_ptr<mscclpp::TcpBootstrap> bootstrap;
|
|
mscclpp::UniqueId id;
|
|
if (gEnv->rank < numRanksToUse) {
|
|
bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, numRanksToUse);
|
|
if (gEnv->rank == 0) id = bootstrap->createUniqueId();
|
|
}
|
|
MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
|
|
|
|
if (gEnv->rank >= numRanksToUse) {
|
|
return;
|
|
}
|
|
bootstrap->initialize(id);
|
|
communicator = std::make_shared<mscclpp::Communicator>(bootstrap);
|
|
}
|
|
|
|
// Tear down per-test state before handing off to the base fixture.
// NOTE(review): the order here looks deliberate — connections are cleared
// before the communicator is reset; registeredMemories are cleared after,
// presumably because RegisteredMemory does not require a live Communicator.
// Confirm before reordering.
void CommunicatorTestBase::TearDown() {
  connections.clear();
  communicator.reset();
  registeredMemories.clear();
  MultiProcessTest::TearDown();
}
|
|
|
|
// Restrict this test to the first `num` ranks (default -1 means all ranks).
void CommunicatorTestBase::setNumRanksToUse(int num) {
  numRanksToUse = num;
}
|
|
|
|
void CommunicatorTestBase::connectMesh(bool useIpc, bool useIb, bool useEthernet) {
|
|
std::vector<std::shared_future<mscclpp::Connection>> connectionFutures(numRanksToUse);
|
|
std::vector<std::shared_future<mscclpp::Connection>> cpuConnectionFutures(numRanksToUse);
|
|
for (int i = 0; i < numRanksToUse; i++) {
|
|
if (i != gEnv->rank) {
|
|
if ((rankToNode(i) == rankToNode(gEnv->rank)) && useIpc) {
|
|
connectionFutures[i] = communicator->connect(mscclpp::Transport::CudaIpc, i);
|
|
} else if (useIb) {
|
|
connectionFutures[i] = communicator->connect(ibTransport, i);
|
|
cpuConnectionFutures[i] = communicator->connect({ibTransport, mscclpp::DeviceType::CPU}, i);
|
|
} else if (useEthernet) {
|
|
connectionFutures[i] = communicator->connect(mscclpp::Transport::Ethernet, i);
|
|
cpuConnectionFutures[i] = communicator->connect({mscclpp::Transport::Ethernet, mscclpp::DeviceType::CPU}, i);
|
|
}
|
|
}
|
|
}
|
|
for (int i = 0; i < numRanksToUse; i++) {
|
|
if (i != gEnv->rank) {
|
|
connections[i] = connectionFutures[i].get();
|
|
if (cpuConnectionFutures[i].valid()) {
|
|
cpuConnections[i] = cpuConnectionFutures[i].get();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Register a local memory and receive corresponding remote memories
|
|
void CommunicatorTestBase::registerMemoryPairs(void* buff, size_t buffSize, mscclpp::TransportFlags transport, int tag,
|
|
const std::vector<int>& remoteRanks,
|
|
mscclpp::RegisteredMemory& localMemory,
|
|
std::unordered_map<int, mscclpp::RegisteredMemory>& remoteMemories) {
|
|
localMemory = communicator->registerMemory(buff, buffSize, transport);
|
|
std::unordered_map<int, std::shared_future<mscclpp::RegisteredMemory>> futureRemoteMemories;
|
|
for (int remoteRank : remoteRanks) {
|
|
if (remoteRank != communicator->bootstrap()->getRank()) {
|
|
communicator->sendMemory(localMemory, remoteRank, tag);
|
|
futureRemoteMemories[remoteRank] = communicator->recvMemory(remoteRank, tag);
|
|
}
|
|
}
|
|
for (int remoteRank : remoteRanks) {
|
|
if (remoteRank != communicator->bootstrap()->getRank()) {
|
|
remoteMemories[remoteRank] = futureRemoteMemories[remoteRank].get();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Register a local memory an receive one corresponding remote memory
|
|
void CommunicatorTestBase::registerMemoryPair(void* buff, size_t buffSize, mscclpp::TransportFlags transport, int tag,
|
|
int remoteRank, mscclpp::RegisteredMemory& localMemory,
|
|
mscclpp::RegisteredMemory& remoteMemory) {
|
|
std::vector<int> remoteRanks = {remoteRank};
|
|
std::unordered_map<int, mscclpp::RegisteredMemory> remoteMemories;
|
|
registerMemoryPairs(buff, buffSize, transport, tag, remoteRanks, localMemory, remoteMemories);
|
|
remoteMemory = remoteMemories[remoteRank];
|
|
}
|
|
|
|
void CommunicatorTest::SetUp() {
|
|
CommunicatorTestBase::SetUp();
|
|
|
|
ASSERT_EQ((deviceBufferSize / sizeof(int)) % gEnv->worldSize, 0);
|
|
|
|
#if defined(USE_IBVERBS)
|
|
connectMesh(true, true, false);
|
|
#else
|
|
connectMesh(true, false, false);
|
|
#endif
|
|
|
|
devicePtr.resize(numBuffers);
|
|
localMemory.resize(numBuffers);
|
|
remoteMemory.resize(numBuffers);
|
|
|
|
std::vector<int> remoteRanks;
|
|
for (int i = 0; i < gEnv->worldSize; i++) {
|
|
if (i != gEnv->rank) {
|
|
remoteRanks.push_back(i);
|
|
}
|
|
}
|
|
|
|
#if defined(USE_IBVERBS)
|
|
auto transport = mscclpp::Transport::CudaIpc | ibTransport;
|
|
#else
|
|
auto transport = mscclpp::Transport::CudaIpc;
|
|
#endif
|
|
|
|
for (size_t n = 0; n < numBuffers; n++) {
|
|
devicePtr[n] = mscclpp::detail::gpuCallocShared<int>(deviceBufferSize / sizeof(int));
|
|
registerMemoryPairs(devicePtr[n].get(), deviceBufferSize, transport, 0, remoteRanks, localMemory[n],
|
|
remoteMemory[n]);
|
|
}
|
|
}
|
|
|
|
// Release per-test resources in reverse order of their creation in SetUp()
// (registered memories first, device buffers last), then run the base
// fixture's teardown.
void CommunicatorTest::TearDown() {
  remoteMemory.clear();
  localMemory.clear();
  devicePtr.clear();
  CommunicatorTestBase::TearDown();
}
|
|
|
|
void CommunicatorTest::deviceBufferInit() {
|
|
size_t dataCount = deviceBufferSize / sizeof(int);
|
|
for (int n = 0; n < (int)devicePtr.size(); n++) {
|
|
std::vector<int> hostBuffer(dataCount, 0);
|
|
for (size_t i = 0; i < dataCount; i++) {
|
|
hostBuffer[i] = gEnv->rank + n * gEnv->worldSize;
|
|
}
|
|
mscclpp::gpuMemcpy<int>(devicePtr[n].get(), hostBuffer.data(), dataCount, cudaMemcpyHostToDevice);
|
|
}
|
|
}
|
|
|
|
void CommunicatorTest::writeToRemote(int dataCountPerRank) {
|
|
for (size_t n = 0; n < numBuffers; n++) {
|
|
for (int i = 0; i < gEnv->worldSize; i++) {
|
|
if (i != gEnv->rank) {
|
|
auto& conn = connections.at(i);
|
|
auto& peerMemory = remoteMemory[n].at(i);
|
|
conn.write(peerMemory, gEnv->rank * dataCountPerRank * sizeof(int), localMemory[n],
|
|
gEnv->rank * dataCountPerRank * sizeof(int), dataCountPerRank * sizeof(int));
|
|
conn.flush();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
bool CommunicatorTest::testWriteCorrectness(bool skipLocal) {
|
|
size_t dataCount = deviceBufferSize / sizeof(int);
|
|
for (int n = 0; n < (int)devicePtr.size(); n++) {
|
|
std::vector<int> hostBuffer(dataCount, 0);
|
|
mscclpp::gpuMemcpy<int>(hostBuffer.data(), devicePtr[n].get(), dataCount, cudaMemcpyDeviceToHost);
|
|
for (int i = 0; i < gEnv->worldSize; i++) {
|
|
if (((i / gEnv->nRanksPerNode) == (gEnv->rank / gEnv->nRanksPerNode)) && skipLocal) {
|
|
continue;
|
|
}
|
|
for (size_t j = i * dataCount / gEnv->worldSize; j < (i + 1) * dataCount / gEnv->worldSize; j++) {
|
|
if (hostBuffer[j] != i + n * gEnv->worldSize) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// One-sided writes into each peer's buffer, then poll until the remote data
// is observed locally.
TEST(CommunicatorTest, BasicWrite) {
  if (gEnv->rank >= numRanksToUse) return;

  deviceBufferInit();
  communicator->bootstrap()->barrier();

  writeToRemote(deviceBufferSize / sizeof(int) / gEnv->worldSize);
  communicator->bootstrap()->barrier();

  // Remote writes complete asynchronously; poll for the expected contents,
  // bailing out after a fixed iteration budget.
  bool ready = false;
  for (int niter = 1; !ready; niter++) {
    ready = testWriteCorrectness();
    if (niter == 10000) {
      FAIL() << "Polling is stuck.";
    }
  }
  communicator->bootstrap()->barrier();
}
|
|
|
|
// One thread per rank: every in-range thread except the one matching `rank`
// blocks on its peer's host-to-device semaphore.
__global__ void kernelWaitSemaphores(mscclpp::Host2DeviceSemaphore::DeviceHandle* deviceSemaphores, int rank,
                                     int worldSize) {
  const int peer = threadIdx.x;
  if (peer >= worldSize || peer == rank) return;
  deviceSemaphores[peer].wait();
}
|
|
|
|
// Same write pattern as BasicWrite, but completion is signaled through
// host-to-device semaphores and awaited inside a kernel instead of polling.
TEST(CommunicatorTest, WriteWithDeviceSemaphores) {
  if (gEnv->rank >= numRanksToUse) return;

  // Build one host-to-device semaphore per remote peer.
  std::unordered_map<int, std::shared_ptr<mscclpp::Host2DeviceSemaphore>> semaphores;
  // Iterate by reference: iterating by value would copy each map entry
  // (including its Connection) on every iteration.
  for (auto& entry : connections) {
    auto& conn = entry.second;
    semaphores.insert({entry.first, std::make_shared<mscclpp::Host2DeviceSemaphore>(*communicator, conn)});
  }
  communicator->bootstrap()->barrier();

  deviceBufferInit();
  communicator->bootstrap()->barrier();

  // Copy each semaphore's device handle into a device array indexed by peer
  // rank; slot gEnv->rank stays zero-initialized and is never waited on.
  auto deviceSemaphoreHandles =
      mscclpp::detail::gpuCallocShared<mscclpp::Host2DeviceSemaphore::DeviceHandle>(gEnv->worldSize);
  for (int i = 0; i < gEnv->worldSize; i++) {
    if (i != gEnv->rank) {
      mscclpp::Host2DeviceSemaphore::DeviceHandle deviceHandle = semaphores[i]->deviceHandle();
      mscclpp::gpuMemcpy<mscclpp::Host2DeviceSemaphore::DeviceHandle>(deviceSemaphoreHandles.get() + i, &deviceHandle,
                                                                      1, cudaMemcpyHostToDevice);
    }
  }
  communicator->bootstrap()->barrier();

  writeToRemote(deviceBufferSize / sizeof(int) / gEnv->worldSize);

  // Tell each peer our writes are in flight.
  for (int i = 0; i < gEnv->worldSize; i++) {
    if (i != gEnv->rank) {
      semaphores[i]->signal();
    }
  }

  // Wait on-device for every peer's signal before validating the buffers.
  kernelWaitSemaphores<<<1, gEnv->worldSize>>>(deviceSemaphoreHandles.get(), gEnv->rank, gEnv->worldSize);
  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());

  ASSERT_TRUE(testWriteCorrectness());
  communicator->bootstrap()->barrier();
}
|
|
|
|
// Same write pattern as BasicWrite, but completion is synchronized with
// host-to-host semaphores over the CPU-side connections.
TEST(CommunicatorTest, WriteWithHostSemaphores) {
  if (gEnv->rank >= numRanksToUse) return;

  std::unordered_map<int, std::shared_ptr<mscclpp::Host2HostSemaphore>> semaphores;
  // Iterate over cpuConnections, which is guaranteed by design to only contain CPU connections.
  // Iterate by reference: iterating by value would copy each map entry
  // (including its Connection) on every iteration.
  for (auto& entry : cpuConnections) {
    auto& conn = entry.second;
    // Host2HostSemaphore cannot be used with CudaIpc transport
    if (conn.transport() == mscclpp::Transport::CudaIpc) continue;
    semaphores.insert({entry.first, std::make_shared<mscclpp::Host2HostSemaphore>(*communicator, conn)});
  }
  communicator->bootstrap()->barrier();

  deviceBufferInit();
  communicator->bootstrap()->barrier();

  writeToRemote(deviceBufferSize / sizeof(int) / gEnv->worldSize);

  // Apply `op` to the semaphore of every remote peer that is not connected
  // via CudaIpc (those peers have no host semaphore).
  auto forEachPeerSemaphore = [&](auto&& op) {
    for (int i = 0; i < gEnv->worldSize; i++) {
      if (i != gEnv->rank && connections[i].transport() != mscclpp::Transport::CudaIpc) {
        op(*semaphores[i]);
      }
    }
  };

  // Two full signal/wait rounds with every peer, matching the peers doing the
  // same, before checking that all remote writes have landed.
  forEachPeerSemaphore([](mscclpp::Host2HostSemaphore& s) { s.signal(); });
  forEachPeerSemaphore([](mscclpp::Host2HostSemaphore& s) { s.wait(); });
  forEachPeerSemaphore([](mscclpp::Host2HostSemaphore& s) { s.signal(); });
  forEachPeerSemaphore([](mscclpp::Host2HostSemaphore& s) { s.wait(); });

  ASSERT_TRUE(testWriteCorrectness());
  communicator->bootstrap()->barrier();
}