Files
mscclpp/test/mp_unit/communicator_tests.cu
Changho Hwang 6cd8960394 DirectChannel Unit Tests (#102)
* Add DirectChannel unit tests
* Split mp_unit_tests.cu into multiple files
2023-06-15 20:55:57 +08:00

277 lines
9.2 KiB
Plaintext

#include <mpi.h>
#include <mscclpp/cuda_utils.hpp>
#include <mscclpp/epoch.hpp>
#include "mp_unit_tests.hpp"
// Per-test setup: establish a bootstrap session among the first numRanksToUse
// MPI ranks and create the Communicator used by the test body.
void CommunicatorTestBase::SetUp() {
  MultiProcessTest::SetUp();
  // Default: use every rank in the MPI world.
  if (numRanksToUse == -1) {
    numRanksToUse = gEnv->worldSize;
  }
  ASSERT_LE(numRanksToUse, gEnv->worldSize);
  std::shared_ptr<mscclpp::Bootstrap> bootstrap;
  mscclpp::UniqueId id;
  // Only participating ranks build a bootstrap; rank 0 generates the unique id
  // that all participants need in order to join the same session. `id` stays
  // uninitialized on other ranks until the broadcast below fills it in.
  if (gEnv->rank < numRanksToUse) {
    bootstrap = std::make_shared<mscclpp::Bootstrap>(gEnv->rank, numRanksToUse);
    if (gEnv->rank == 0) id = bootstrap->createUniqueId();
  }
  // Collective over MPI_COMM_WORLD: every rank — including non-participants —
  // must reach this call, so it must precede the early return below.
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
  // Ranks beyond numRanksToUse take no further part in the test.
  if (gEnv->rank >= numRanksToUse) {
    return;
  }
  bootstrap->initialize(id);
  communicator = std::make_shared<mscclpp::Communicator>(bootstrap);
  ibTransport = ibIdToTransport(rankToLocalRank(gEnv->rank));
}
// Per-test teardown. Order matters: drop the connections before resetting the
// communicator they were created from, then run the base-class teardown last
// (mirror image of SetUp).
void CommunicatorTestBase::TearDown() {
  connections.clear();
  communicator.reset();
  MultiProcessTest::TearDown();
}
// Restrict the test to the first `num` ranks of the MPI world; must be called
// before SetUp() to take effect.
void CommunicatorTestBase::setNumRanksToUse(int num) {
  numRanksToUse = num;
}
// Position of `rank` within its node, assuming a node-major rank layout.
int CommunicatorTestBase::rankToLocalRank(int rank) const {
  return rank % gEnv->nRanksPerNode;
}
// Index of the node that hosts `rank`, assuming a node-major rank layout.
int CommunicatorTestBase::rankToNode(int rank) const {
  return rank / gEnv->nRanksPerNode;
}
// Connect this rank to every other participating rank (full mesh). Peers on
// the same node use CUDA IPC unless useIbOnly forces InfiniBand; peers on
// other nodes always use InfiniBand.
void CommunicatorTestBase::connectMesh(bool useIbOnly) {
  const int selfRank = gEnv->rank;
  for (int peer = 0; peer < numRanksToUse; peer++) {
    if (peer == selfRank) continue;
    const bool sameNode = rankToNode(peer) == rankToNode(selfRank);
    auto transport = (sameNode && !useIbOnly) ? mscclpp::Transport::CudaIpc : ibTransport;
    connections[peer] = communicator->connectOnSetup(peer, 0, transport);
  }
  // Execute all queued connection requests.
  communicator->setup();
}
// Register a local memory and receive corresponding remote memories.
//
// Registers [buff, buff+buffSize) for `transport`, exchanges the registration
// with every rank in `remoteRanks` under `tag`, and fills `remoteMemories`
// keyed by remote rank. Entries equal to our own rank are skipped. The
// exchange completes inside the setup() call below.
void CommunicatorTestBase::registerMemoryPairs(void* buff, size_t buffSize, mscclpp::TransportFlags transport, int tag,
                                               const std::vector<int>& remoteRanks,
                                               mscclpp::RegisteredMemory& localMemory,
                                               std::unordered_map<int, mscclpp::RegisteredMemory>& remoteMemories) {
  localMemory = communicator->registerMemory(buff, buffSize, transport);
  // Hoisted: our own rank is invariant across both loops, so query the
  // bootstrapper once instead of on every iteration.
  const int selfRank = communicator->bootstrapper()->getRank();
  std::unordered_map<int, mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> futureRemoteMemories;
  for (int remoteRank : remoteRanks) {
    if (remoteRank != selfRank) {
      communicator->sendMemoryOnSetup(localMemory, remoteRank, tag);
      futureRemoteMemories[remoteRank] = communicator->recvMemoryOnSetup(remoteRank, tag);
    }
  }
  // setup() executes the queued send/recv operations, resolving the futures.
  communicator->setup();
  for (int remoteRank : remoteRanks) {
    if (remoteRank != selfRank) {
      remoteMemories[remoteRank] = futureRemoteMemories[remoteRank].get();
    }
  }
}
// Register a local memory and receive the corresponding remote memory from a
// single peer; convenience wrapper around registerMemoryPairs().
void CommunicatorTestBase::registerMemoryPair(void* buff, size_t buffSize, mscclpp::TransportFlags transport, int tag,
                                              int remoteRank, mscclpp::RegisteredMemory& localMemory,
                                              mscclpp::RegisteredMemory& remoteMemory) {
  std::unordered_map<int, mscclpp::RegisteredMemory> remoteMemories;
  const std::vector<int> remoteRanks(1, remoteRank);
  registerMemoryPairs(buff, buffSize, transport, tag, remoteRanks, localMemory, remoteMemories);
  remoteMemory = remoteMemories[remoteRank];
}
// Per-test setup: build the full connection mesh, allocate numBuffers device
// buffers, and exchange their registrations with every remote rank.
void CommunicatorTest::SetUp() {
  CommunicatorTestBase::SetUp();
  // Each rank owns an equal slice of every buffer, so the element count must
  // divide evenly by the world size.
  ASSERT_EQ((deviceBufferSize / sizeof(int)) % gEnv->worldSize, 0);
  connectMesh();
  devicePtr.resize(numBuffers);
  localMemory.resize(numBuffers);
  remoteMemory.resize(numBuffers);
  // Every rank except ourselves is a remote peer.
  std::vector<int> remoteRanks;
  for (int peer = 0; peer < gEnv->worldSize; peer++) {
    if (peer != gEnv->rank) remoteRanks.push_back(peer);
  }
  for (int buf = 0; buf < numBuffers; buf++) {
    devicePtr[buf] = mscclpp::allocSharedCuda<int>(deviceBufferSize / sizeof(int));
    registerMemoryPairs(devicePtr[buf].get(), deviceBufferSize, mscclpp::Transport::CudaIpc | ibTransport, 0,
                        remoteRanks, localMemory[buf], remoteMemory[buf]);
  }
}
// Per-test teardown. Order matters: release the registered memories before the
// device buffers they describe, then let the base class tear down the
// connections and the communicator.
void CommunicatorTest::TearDown() {
  remoteMemory.clear();
  localMemory.clear();
  devicePtr.clear();
  CommunicatorTestBase::TearDown();
}
// Fill every device buffer with a rank- and buffer-specific pattern: buffer n
// holds the constant value (rank + n * worldSize) in every element, which
// testWriteCorrectness() later checks per incoming slice.
void CommunicatorTest::deviceBufferInit() {
  size_t dataCount = deviceBufferSize / sizeof(int);
  for (int n = 0; n < (int)devicePtr.size(); n++) {
    // The fill value is constant per buffer, so build the staging vector with
    // the fill constructor instead of an element-wise loop. This also removes
    // the signed/unsigned comparison (`int i < size_t dataCount`) the loop had.
    std::vector<int> hostBuffer(dataCount, gEnv->rank + n * gEnv->worldSize);
    mscclpp::memcpyCuda<int>(devicePtr[n].get(), hostBuffer.data(), dataCount, cudaMemcpyHostToDevice);
  }
}
// Write this rank's slice (dataCountPerRank ints, at rank-scaled offset) of
// every buffer into the matching buffer of every remote rank, flushing each
// connection after its write.
void CommunicatorTest::writeToRemote(int dataCountPerRank) {
  // Our slice sits at the same byte offset on both the local and remote side.
  const size_t sliceOffset = gEnv->rank * dataCountPerRank * sizeof(int);
  const size_t sliceBytes = dataCountPerRank * sizeof(int);
  for (int n = 0; n < numBuffers; n++) {
    for (int peer = 0; peer < gEnv->worldSize; peer++) {
      if (peer == gEnv->rank) continue;
      auto& conn = connections.at(peer);
      auto& peerMemory = remoteMemory[n].at(peer);
      conn->write(peerMemory, sliceOffset, localMemory[n], sliceOffset, sliceBytes);
      conn->flush();
    }
  }
}
// Check that every rank's slice of every buffer holds the value that rank
// wrote for that buffer (rank + n * worldSize). When skipLocal is set, slices
// owned by ranks on this node are ignored. Returns false on the first
// mismatch, true if everything matches.
bool CommunicatorTest::testWriteCorrectness(bool skipLocal) {
  size_t dataCount = deviceBufferSize / sizeof(int);
  for (int n = 0; n < (int)devicePtr.size(); n++) {
    std::vector<int> hostBuffer(dataCount, 0);
    mscclpp::memcpyCuda<int>(hostBuffer.data(), devicePtr[n].get(), dataCount, cudaMemcpyDeviceToHost);
    for (int i = 0; i < gEnv->worldSize; i++) {
      // Consistency: reuse the rankToNode() helper instead of re-deriving the
      // node index inline with `/ nRanksPerNode`.
      if (skipLocal && (rankToNode(i) == rankToNode(gEnv->rank))) {
        continue;
      }
      // size_t index avoids the signed/unsigned comparison against the
      // size_t-typed slice bounds.
      const size_t sliceBegin = i * dataCount / gEnv->worldSize;
      const size_t sliceEnd = (i + 1) * dataCount / gEnv->worldSize;
      for (size_t j = sliceBegin; j < sliceEnd; j++) {
        if (hostBuffer[j] != i + n * gEnv->worldSize) {
          return false;
        }
      }
    }
  }
  return true;
}
// One-sided writes with no completion signaling: after the post-write barrier,
// poll device memory until every peer's data has landed.
TEST_F(CommunicatorTest, BasicWrite) {
  if (gEnv->rank >= numRanksToUse) return;
  deviceBufferInit();
  communicator->bootstrapper()->barrier();
  writeToRemote(deviceBufferSize / sizeof(int) / gEnv->worldSize);
  communicator->bootstrapper()->barrier();
  // The writes carry no completion notification, so poll until the data shows
  // up — bounded so a lost write fails the test instead of hanging it.
  // Fix: the original failed when niter reached 10000 even if that final poll
  // had succeeded; now a successful poll always exits the loop first.
  bool ready = false;
  for (int niter = 0; !ready; niter++) {
    ready = testWriteCorrectness();
    if (!ready && niter == 10000) {
      FAIL() << "Polling is stuck.";
    }
  }
  communicator->bootstrapper()->barrier();
}
// Each thread waits on the device epoch of one peer rank: thread tid handles
// deviceEpochs[tid]; the thread matching our own rank (and any thread beyond
// worldSize) does nothing. Expects a launch of a single block with at least
// worldSize threads.
__global__ void kernelWaitEpochs(mscclpp::DeviceEpoch::DeviceHandle* deviceEpochs, int rank, int worldSize) {
  int tid = threadIdx.x;
  if (tid == rank || tid >= worldSize) return;
  deviceEpochs[tid].wait();
}
// Writes synchronized by DeviceEpochs: after writing, the host signals each
// peer's epoch, and a kernel waits on all incoming epochs before the data is
// checked — no host-side polling needed.
TEST_F(CommunicatorTest, WriteWithDeviceEpochs) {
  if (gEnv->rank >= numRanksToUse) return;
  // One device epoch per connection.
  std::unordered_map<int, std::shared_ptr<mscclpp::DeviceEpoch>> epochs;
  // const ref: the original `for (auto entry : ...)` copied the map entry
  // (including its shared_ptr, with a refcount bump) on every iteration.
  for (const auto& entry : connections) {
    auto& conn = entry.second;
    epochs.insert({entry.first, std::make_shared<mscclpp::DeviceEpoch>(*communicator.get(), conn)});
  }
  communicator->setup();
  communicator->bootstrapper()->barrier();
  deviceBufferInit();
  communicator->bootstrapper()->barrier();
  // Stage each epoch's device handle into a device-side array indexed by peer
  // rank, so the wait kernel can look its epoch up by threadIdx.
  auto deviceEpochHandles = mscclpp::allocSharedCuda<mscclpp::DeviceEpoch::DeviceHandle>(gEnv->worldSize);
  for (int i = 0; i < gEnv->worldSize; i++) {
    if (i != gEnv->rank) {
      mscclpp::DeviceEpoch::DeviceHandle deviceHandle = epochs[i]->deviceHandle();
      mscclpp::memcpyCuda<mscclpp::DeviceEpoch::DeviceHandle>(deviceEpochHandles.get() + i, &deviceHandle, 1,
                                                              cudaMemcpyHostToDevice);
    }
  }
  communicator->bootstrapper()->barrier();
  writeToRemote(deviceBufferSize / sizeof(int) / gEnv->worldSize);
  for (int i = 0; i < gEnv->worldSize; i++) {
    if (i != gEnv->rank) {
      epochs[i]->signal();
    }
  }
  kernelWaitEpochs<<<1, gEnv->worldSize>>>(deviceEpochHandles.get(), gEnv->rank, gEnv->worldSize);
  // Catch launch-configuration errors at the launch site; execution errors
  // surface in the synchronize below.
  MSCCLPP_CUDATHROW(cudaGetLastError());
  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
  ASSERT_TRUE(testWriteCorrectness());
  communicator->bootstrapper()->barrier();
}
// Writes synchronized by HostEpochs over the non-CudaIpc (IB) connections:
// two host-side signal/wait rounds with every IB peer before checking data.
TEST_F(CommunicatorTest, WriteWithHostEpochs) {
  if (gEnv->rank >= numRanksToUse) return;
  std::unordered_map<int, std::shared_ptr<mscclpp::HostEpoch>> epochs;
  // const ref: the original `for (auto entry : ...)` copied each map entry
  // (including its shared_ptr) per iteration.
  for (const auto& entry : connections) {
    auto& conn = entry.second;
    // HostEpoch cannot be used with CudaIpc transport
    if (conn->transport() == mscclpp::Transport::CudaIpc) continue;
    epochs.insert({entry.first, std::make_shared<mscclpp::HostEpoch>(*communicator.get(), conn)});
  }
  communicator->setup();
  communicator->bootstrapper()->barrier();
  deviceBufferInit();
  communicator->bootstrapper()->barrier();
  writeToRemote(deviceBufferSize / sizeof(int) / gEnv->worldSize);
  // `epochs` holds exactly the non-CudaIpc peers, so consulting it replaces
  // the original `connections[i]->transport()` re-check. This also fixes a
  // latent crash: `connections[i]` used operator[], which default-inserts a
  // null shared_ptr (then dereferences it) for any rank without a connection.
  // The four duplicated loops below are factored into two helpers; iteration
  // stays in ascending peer-rank order like the original.
  auto signalAll = [&]() {
    for (int i = 0; i < gEnv->worldSize; i++) {
      auto it = epochs.find(i);
      if (it != epochs.end()) it->second->signal();
    }
  };
  auto waitAll = [&]() {
    for (int i = 0; i < gEnv->worldSize; i++) {
      auto it = epochs.find(i);
      if (it != epochs.end()) it->second->wait();
    }
  };
  // Two full signal/wait rounds, as in the original.
  signalAll();
  waitAll();
  signalAll();
  waitAll();
  ASSERT_TRUE(testWriteCorrectness());
  communicator->bootstrapper()->barrier();
}