// Copyright (c) Microsoft Corporation. // Licensed under the MIT License. #include #include #include #include #include #include #include #include #include "utils_internal.hpp" double parseSize(const char* value) { std::string valueStr(value); std::istringstream iss(valueStr); long long int units; double size; char size_lit = 0; if (iss >> size) { iss >> std::ws; // eat whitespace iss >> size_lit; } else { return -1.0; } if (size_lit != 0 && !std::isspace(size_lit)) { switch (size_lit) { case 'G': case 'g': units = 1024 * 1024 * 1024; break; case 'M': case 'm': units = 1024 * 1024; break; case 'K': case 'k': units = 1024; break; default: return -1.0; }; } else { units = 1; } return size * units; } mscclpp::PacketType parsePacketType(const char* value) { std::string valueStr(value); if (valueStr == "LL8") { return mscclpp::PacketType::LL8; } else if (valueStr == "LL16") { return mscclpp::PacketType::LL16; } else { throw std::runtime_error("Invalid packet type"); } } double benchTime(int rank, std::shared_ptr bootstrap, std::shared_ptr executor, const mscclpp::ExecutionPlan& plan, std::shared_ptr sendbuff, size_t bufferSize, int niters, int ngrapthIters, mscclpp::PacketType packetType) { mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); cudaGraph_t graph; cudaGraphExec_t graphExec; mscclpp::Timer timer; MSCCLPP_CUDATHROW(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); for (int i = 0; i < niters; i++) { executor->execute(rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16, plan, stream, packetType); } MSCCLPP_CUDATHROW(cudaStreamEndCapture(stream, &graph)); MSCCLPP_CUDATHROW(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); MSCCLPP_CUDATHROW(cudaGraphLaunch(graphExec, stream)); MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); bootstrap->barrier(); timer.reset(); for (int i = 0; i < ngrapthIters; i++) { MSCCLPP_CUDATHROW(cudaGraphLaunch(graphExec, stream)); } MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); double deltaSec = timer.elapsed() * 1.e-6; deltaSec = deltaSec / (niters) / (ngrapthIters); MSCCLPP_CUDATHROW(cudaGraphExecDestroy(graphExec)); MSCCLPP_CUDATHROW(cudaGraphDestroy(graph)); return deltaSec; } int main(int argc, char* argv[]) { if (argc != 5 && argc != 6) { std::cerr << "Usage: " << argv[0] << " " << " (optional) " << std::endl; return 1; } int rank; int worldSize; MPI_Init(NULL, NULL); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &worldSize); MSCCLPP_CUDATHROW(cudaSetDevice(rank)); const size_t bufferSize = parseSize(argv[1]); const std::string executionPlanPath = argv[2]; const int niters = std::stoi(argv[3]); const int ngraphIters = std::stoi(argv[4]); const char* npkitDumpDir = mscclpp::env()->npkitDumpDir.c_str(); mscclpp::PacketType packetType = mscclpp::PacketType::LL16; if (argc == 6) { packetType = parsePacketType(argv[5]); } std::shared_ptr bootstrap; mscclpp::UniqueId id; bootstrap = std::make_shared(rank, worldSize); if (rank == 0) id = bootstrap->createUniqueId(); MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); bootstrap->initialize(id); std::shared_ptr communicator = std::make_shared(bootstrap); std::shared_ptr executor = std::make_shared(communicator); if (npkitDumpDir != nullptr) { NpKit::Init(rank); } mscclpp::ExecutionPlan plan(executionPlanPath, rank); std::shared_ptr sendbuff = mscclpp::GpuBuffer(bufferSize).memory(); std::vector dataHost(bufferSize / sizeof(int), rank); MSCCLPP_CUDATHROW(cudaMemcpy(sendbuff.get(), dataHost.data(), bufferSize, cudaMemcpyHostToDevice)); double deltaSec = benchTime(rank, bootstrap, executor, plan, sendbuff, bufferSize, niters, ngraphIters, packetType); if (npkitDumpDir != nullptr) { NpKit::Dump(npkitDumpDir); NpKit::Shutdown(); } double latencyUs = deltaSec * 1.e6; std::cout << "Rank " << rank << ": " << bufferSize << " bytes " << latencyUs << " us" << std::endl; MPI_Finalize(); return 0; }