Files
mscclpp/test/mp_unit/executor_tests.cc
Changho Hwang 34945fb107 Add GpuBuffer class (#423)
* Renamed and moved mem alloc functions into the `mscclpp::detail::`
namespace (now `mscclpp::detail::gpuCalloc*<T>()`)
* Deprecated constructor-calling mem alloc functions
(`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`)
* Added a new `mscclpp::GpuBuffer<T>()` class that should be used in
general for allocating communication buffers
* Added a new `mscclpp.utils.GpuBuffer` Python class that inherits
`cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc`
* Renamed `mscclpp::memcpyCuda*<T>()` functions to
`mscclpp::gpuMemcpy*<T>()` for naming consistency
* A few fixes in NVLS memory allocation
* Tackled minor compiler warnings
2025-01-07 18:40:01 -08:00

66 lines
2.2 KiB
C++

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <limits.h>  // PATH_MAX
#include <mpi.h>
#include <unistd.h>  // readlink, ssize_t

#include <filesystem>
#include <mscclpp/npkit/npkit.hpp>

#include "mp_unit_tests.hpp"
namespace {
// Returns the absolute path of the currently running executable by resolving
// the Linux-specific /proc/self/exe symlink.
// Throws std::runtime_error if the symlink cannot be read.
std::string getExecutablePath() {
  char result[PATH_MAX];
  // readlink() does not NUL-terminate the buffer; the byte count it returns
  // is passed to the std::string constructor instead.
  ssize_t count = readlink("/proc/self/exe", result, sizeof(result));
  if (count == -1) {
    throw std::runtime_error("Failed to get executable path");
  }
  return std::string(result, count);
}
}  // namespace
void ExecutorTest::SetUp() {
MultiProcessTest::SetUp();
MSCCLPP_CUDATHROW(cudaSetDevice(rankToLocalRank(gEnv->rank)));
std::shared_ptr<mscclpp::TcpBootstrap> bootstrap;
mscclpp::UniqueId id;
bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
if (gEnv->rank == 0) id = bootstrap->createUniqueId();
MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
bootstrap->initialize(id);
std::shared_ptr<mscclpp::Communicator> communicator = std::make_shared<mscclpp::Communicator>(bootstrap);
executor = std::make_shared<mscclpp::Executor>(communicator);
npkitDumpDir = getenv("NPKIT_DUMP_DIR");
if (npkitDumpDir != nullptr) {
NpKit::Init(gEnv->rank);
}
}
void ExecutorTest::TearDown() {
  // Persist NPKit profiling results if profiling was enabled in SetUp().
  if (npkitDumpDir) {
    NpKit::Dump(npkitDumpDir);
    NpKit::Shutdown();
  }
  // Release the executor before the base class tears down shared state.
  executor.reset();
  MultiProcessTest::TearDown();
}
TEST_F(ExecutorTest, TwoNodesAllreduce) {
  // The execution plan below was generated for exactly this topology.
  if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) {
    GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2";
    return;
  }

  // Locate the execution plan JSON relative to the test binary's location.
  std::filesystem::path exePath(getExecutablePath());
  std::filesystem::path planPath =
      exePath.parent_path().parent_path().parent_path() / "test/execution-files/allreduce.json";
  mscclpp::ExecutionPlan plan(planPath.string());

  constexpr int bufferSize = 1024 * 1024;
  std::shared_ptr<char> sendbuff = mscclpp::GpuBuffer(bufferSize).memory();
  mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
  // In-place allreduce: sendbuff serves as both input and output buffer.
  executor->execute(gEnv->rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16,
                    plan, stream);
  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
}