Files
mscclpp/test/mp_unit/executor_tests.cc
Binyang Li be6a941fba New DSL implementation (#579)
The PR contains following changes:
Python side:
- Channel based DSL implementation: decouple channel with chunk.
- Users create channel explicitly, only need local_rank, remote_rank and
channel_type
- Adjust executor json file, add remote_buffer fields, different op can
use different channel and remote buffers combination.
- Reimplement operation fusion, data dependency check mechanism
- Add new op such as semaphore, pipeline 
- Clean code and enhance document
C++ side: 
- Support new execution file json format
- Support semaphore and pipeline operation
- code clean, support non-zero copy scenario

---------

Co-authored-by: Caio Rocha <caiorocha@microsoft.com>
Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
2025-08-09 00:36:20 -07:00

67 lines
2.2 KiB
C++

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mpi.h>
#include <filesystem>
#include <mscclpp/env.hpp>
#include <mscclpp/npkit/npkit.hpp>
#include "mp_unit_tests.hpp"
namespace {
std::string getExecutablePath() {
char result[PATH_MAX];
ssize_t count = readlink("/proc/self/exe", result, PATH_MAX);
if (count == -1) {
throw std::runtime_error("Failed to get executable path");
}
return std::string(result, count);
}
} // namespace
void ExecutorTest::SetUp() {
MultiProcessTest::SetUp();
MSCCLPP_CUDATHROW(cudaSetDevice(rankToLocalRank(gEnv->rank)));
std::shared_ptr<mscclpp::TcpBootstrap> bootstrap;
mscclpp::UniqueId id;
bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
if (gEnv->rank == 0) id = bootstrap->createUniqueId();
MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
bootstrap->initialize(id);
std::shared_ptr<mscclpp::Communicator> communicator = std::make_shared<mscclpp::Communicator>(bootstrap);
executor = std::make_shared<mscclpp::Executor>(communicator);
npkitDumpDir = mscclpp::env()->npkitDumpDir;
if (npkitDumpDir != "") {
NpKit::Init(gEnv->rank);
}
}
void ExecutorTest::TearDown() {
if (npkitDumpDir != "") {
NpKit::Dump(npkitDumpDir);
NpKit::Shutdown();
}
executor.reset();
MultiProcessTest::TearDown();
}
TEST_F(ExecutorTest, TwoNodesAllreduce) {
if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) {
GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2";
return;
}
std::string executablePath = getExecutablePath();
std::filesystem::path path = executablePath;
std::filesystem::path executionFilesPath =
path.parent_path().parent_path().parent_path() / "test/execution-files/allreduce.json";
mscclpp::ExecutionPlan plan(executionFilesPath.string(), gEnv->rank);
const int bufferSize = 1024 * 1024;
std::shared_ptr<char> sendbuff = mscclpp::GpuBuffer(bufferSize).memory();
mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
executor->execute(gEnv->rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16,
plan, stream);
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
}