mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 09:46:00 +00:00
Reorganize the current native algorithm implementation and the DSL algorithm implementation. Provide a unified API for DSL and native algorithms, and an interface to tune the algorithms. Provide an interface for PyTorch integration with the native API and the DSL. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
699 lines
27 KiB
C++
699 lines
27 KiB
C++
// Copyright (c) Microsoft Corporation.
|
|
// Licensed under the MIT license.
|
|
|
|
#include "execution_plan.hpp"
|
|
|
|
#include <cassert>
|
|
#include <cstdlib>
|
|
#include <filesystem>
|
|
#include <fstream>
|
|
#include <iomanip>
|
|
#include <set>
|
|
#include <sstream>
|
|
|
|
#include "debug.h"
|
|
|
|
namespace {
|
|
|
|
// Returns a new vector containing copies of the elements of `vec` for which
// `pred` returns true, in their original order.
template <typename T, typename Predicate>
std::vector<T> filter(const std::vector<T>& vec, Predicate pred) {
  std::vector<T> kept;
  for (const T& item : vec) {
    if (pred(item)) {
      kept.push_back(item);
    }
  }
  return kept;
}
|
|
|
|
auto getOpType = [](const std::string& str) {
|
|
if (str == "nop") {
|
|
return mscclpp::OperationType::NOP;
|
|
} else if (str == "barrier") {
|
|
return mscclpp::OperationType::BARRIER;
|
|
} else if (str == "put") {
|
|
return mscclpp::OperationType::PUT;
|
|
} else if (str == "pws") {
|
|
return mscclpp::OperationType::PUT_WITH_SIGNAL;
|
|
} else if (str == "pwsf") {
|
|
return mscclpp::OperationType::PUT_WITH_SIGNAL_AND_FLUSH;
|
|
} else if (str == "get") {
|
|
return mscclpp::OperationType::GET;
|
|
} else if (str == "copy") {
|
|
return mscclpp::OperationType::COPY;
|
|
} else if (str == "signal") {
|
|
return mscclpp::OperationType::SIGNAL;
|
|
} else if (str == "wait") {
|
|
return mscclpp::OperationType::WAIT;
|
|
} else if (str == "flush") {
|
|
return mscclpp::OperationType::FLUSH;
|
|
} else if (str == "re") {
|
|
return mscclpp::OperationType::REDUCE;
|
|
} else if (str == "res") {
|
|
return mscclpp::OperationType::REDUCE_SEND;
|
|
} else if (str == "rre") {
|
|
return mscclpp::OperationType::READ_REDUCE;
|
|
} else if (str == "rres") {
|
|
return mscclpp::OperationType::READ_REDUCE_SEND;
|
|
} else if (str == "ppkt") {
|
|
return mscclpp::OperationType::PUT_PACKETS;
|
|
} else if (str == "rppkt") {
|
|
return mscclpp::OperationType::READ_PUT_PACKETS;
|
|
} else if (str == "respkt") {
|
|
return mscclpp::OperationType::REDUCE_SEND_PACKETS;
|
|
} else if (str == "cpkt") {
|
|
return mscclpp::OperationType::COPY_PACKETS;
|
|
} else if (str == "upkt") {
|
|
return mscclpp::OperationType::UNPACK_PACKETS;
|
|
} else if (str == "repkt") {
|
|
return mscclpp::OperationType::REDUCE_PACKETS;
|
|
} else if (str == "recpkt") {
|
|
return mscclpp::OperationType::REDUCE_COPY_PACKETS;
|
|
} else if (str == "recspkt") {
|
|
return mscclpp::OperationType::REDUCE_COPY_SEND_PACKETS;
|
|
} else if (str == "glres") {
|
|
return mscclpp::OperationType::MULTI_LOAD_REDUCE_STORE;
|
|
} else if (str == "rlxsignal") {
|
|
return mscclpp::OperationType::RELAXED_SIGNAL;
|
|
} else if (str == "rlxwait") {
|
|
return mscclpp::OperationType::RELAXED_WAIT;
|
|
} else if (str == "pipeline") {
|
|
return mscclpp::OperationType::PIPELINE;
|
|
} else if (str == "sem_acquire") {
|
|
return mscclpp::OperationType::SEM_ACQUIRE;
|
|
} else if (str == "sem_release") {
|
|
return mscclpp::OperationType::SEM_RELEASE;
|
|
} else {
|
|
throw mscclpp::Error("Invalid operation type: " + str, mscclpp::ErrorCode::ExecutorError);
|
|
}
|
|
};
|
|
|
|
auto convertToBufferType = [](const std::string& str) {
|
|
if (str == "i") {
|
|
return mscclpp::BufferType::INPUT;
|
|
} else if (str == "o") {
|
|
return mscclpp::BufferType::OUTPUT;
|
|
} else if (str == "s") {
|
|
return mscclpp::BufferType::SCRATCH;
|
|
} else {
|
|
throw mscclpp::Error("Invalid buffer type", mscclpp::ErrorCode::ExecutorError);
|
|
}
|
|
};
|
|
|
|
auto convertToChannelType = [](const std::string& str) {
|
|
if (str == "memory") {
|
|
return mscclpp::ChannelType::MEMORY;
|
|
} else if (str == "port") {
|
|
return mscclpp::ChannelType::PORT;
|
|
} else if (str == "none") {
|
|
return mscclpp::ChannelType::NONE;
|
|
} else if (str == "switch") {
|
|
return mscclpp::ChannelType::SWITCH;
|
|
} else {
|
|
throw mscclpp::Error("Invalid channel type", mscclpp::ErrorCode::ExecutorError);
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
namespace mscclpp {
|
|
using json = nlohmann::json;
|
|
|
|
// Parses the JSON plan at `planPath` and caches only its plan-level metadata:
// name, collective, in-place flag, resource-reuse and double-scratch options,
// buffer alignment and the accepted message-size range. Per-rank channels,
// buffers and operations are populated later by loadExecutionPlan().
ExecutionPlan::Impl::Impl(const std::string& planPath, int rank)
    : planPath(planPath), isUsingPacket(false), rank(rank) {
  std::ifstream file(this->planPath);
  json obj = json::parse(file);
  this->name = obj["name"];
  this->collective = obj["collective"];
  this->isInPlace = obj["inplace"];
  this->reuseResources = obj.value("reuse_resources", false);
  this->doubleScratchBuffer = obj.value("use_double_scratch_buffer", false);
  this->bufferAlignment = obj.value("buffer_alignment", 16);
  // json::value() returns the type of the default argument. A plain `0` would
  // make it return `int` and truncate minimum sizes of 2 GiB or more, so the
  // default is spelled as a 64-bit value like the maximum below.
  this->minMessageSize = obj.value("min_message_size", uint64_t{0});
  this->maxMessageSize = obj.value("max_message_size", std::numeric_limits<uint64_t>::max());
}
|
|
|
|
// Returns the channels of this rank whose type equals `channelType`.
// Throws std::out_of_range (from at()) if no channels were recorded for this rank.
std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(ChannelType channelType) const {
  const auto& localChannels = this->channelInfos_.at(rank);
  return filter(localChannels, [channelType](const ChannelInfo& info) { return info.channelType == channelType; });
}
|
|
|
|
std::vector<ChannelInfo> ExecutionPlan::Impl::getUnpairedChannelInfos(int worldSize, ChannelType channelType) {
|
|
std::vector<ChannelInfo> unpaired;
|
|
for (int peer = 0; peer < worldSize; peer++) {
|
|
if (peer == rank) {
|
|
continue;
|
|
}
|
|
if (this->channelCountMap_[{rank, channelType}][peer] < this->channelCountMap_[{peer, channelType}][rank]) {
|
|
int count = this->channelCountMap_[{peer, channelType}][rank] - this->channelCountMap_[{rank, channelType}][peer];
|
|
for (int i = 0; i < count; i++) {
|
|
ChannelInfo info;
|
|
info.channelType = channelType;
|
|
info.connectedPeers.push_back(peer);
|
|
unpaired.push_back(info);
|
|
}
|
|
}
|
|
}
|
|
return unpaired;
|
|
}
|
|
|
|
std::vector<int> ExecutionPlan::Impl::getConnectedPeers() const {
|
|
std::set<int> peers;
|
|
for (const auto& info : this->channelInfos_.at(rank)) {
|
|
for (int peer : info.connectedPeers) {
|
|
peers.insert(peer);
|
|
}
|
|
}
|
|
if (this->channelInfosByDstRank_.find(rank) != this->channelInfosByDstRank_.end()) {
|
|
for (const auto& info : this->channelInfosByDstRank_.at(rank)) {
|
|
for (int peer : info.connectedPeers) {
|
|
peers.insert(peer);
|
|
}
|
|
}
|
|
}
|
|
return std::vector<int>(peers.begin(), peers.end());
|
|
}
|
|
|
|
std::vector<BufferInfo> ExecutionPlan::Impl::getRemoteBufferInfos() const {
|
|
if (this->remoteBufferInfos_.find(rank) == this->remoteBufferInfos_.end()) {
|
|
return std::vector<BufferInfo>();
|
|
}
|
|
return this->remoteBufferInfos_.at(rank);
|
|
}
|
|
|
|
std::vector<BufferInfo> ExecutionPlan::Impl::getLocalBufferToSend() const {
|
|
if (this->localBufferToSend_.find(rank) == this->localBufferToSend_.end()) {
|
|
return std::vector<BufferInfo>();
|
|
}
|
|
return this->localBufferToSend_.at(rank);
|
|
}
|
|
|
|
size_t ExecutionPlan::Impl::calScratchBufferSize(size_t inputSize, size_t outputSize) const {
|
|
if (reuseResources && this->scratchChunks > 0) {
|
|
return PREDFINED_SCRATCH_SIZE;
|
|
}
|
|
|
|
size_t sizePerChunk = 0;
|
|
size_t size = 0;
|
|
if (this->inputChunks != 0) {
|
|
sizePerChunk = (inputSize + this->inputChunks - 1) / this->inputChunks;
|
|
} else if (this->outputChunks != 0) {
|
|
sizePerChunk = (outputSize + this->outputChunks - 1) / this->outputChunks;
|
|
} else {
|
|
throw mscclpp::Error("Output or Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError);
|
|
}
|
|
|
|
if (this->isUsingPacket) {
|
|
size = sizePerChunk * this->scratchChunks * 2; /* data + flag*/
|
|
} else {
|
|
size = sizePerChunk * this->scratchChunks;
|
|
}
|
|
|
|
if (this->doubleScratchBuffer) {
|
|
size = size * 2;
|
|
}
|
|
return (size + this->bufferAlignment - 1) / this->bufferAlignment * this->bufferAlignment;
|
|
}
|
|
|
|
size_t ExecutionPlan::Impl::calMaxScratchChunkSize(size_t scratchSize) const {
|
|
if (this->scratchChunks == 0) {
|
|
return 0;
|
|
}
|
|
if (this->doubleScratchBuffer) {
|
|
scratchSize = scratchSize / 2;
|
|
}
|
|
size_t size = (scratchSize + this->scratchChunks - 1) / this->scratchChunks;
|
|
return (size + this->bufferAlignment - 1) / this->bufferAlignment * this->bufferAlignment;
|
|
}
|
|
|
|
std::vector<Operation> ExecutionPlan::Impl::getOperations(int threadblock) const {
|
|
return this->operations[threadblock];
|
|
}
|
|
|
|
// Returns the number of threadblocks for which operations have been parsed.
int ExecutionPlan::Impl::getThreadblockCount() const {
  return static_cast<int>(this->operations.size());
}
|
|
|
|
void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize, size_t outputSize, size_t contsSrcOffset,
|
|
size_t constDstOffset) {
|
|
std::ifstream file(this->planPath);
|
|
json obj = json::parse(file);
|
|
if (this->name != obj["name"]) {
|
|
throw Error("Plan name does not match", ErrorCode::ExecutorError);
|
|
}
|
|
this->collective = obj["collective"];
|
|
std::string protocol = obj["protocol"];
|
|
if (protocol == "LL") {
|
|
this->isUsingPacket = true;
|
|
}
|
|
this->inputSize = inputSize;
|
|
this->outputSize = outputSize;
|
|
this->nThreadsPerBlock = obj.value("num_threads_per_block", 1024);
|
|
this->minMessageSize = obj.value("min_message_size", 0);
|
|
this->maxMessageSize = obj.value("max_message_size", std::numeric_limits<uint64_t>::max());
|
|
|
|
this->isInPlace = obj["inplace"];
|
|
const auto& gpus = obj["gpus"];
|
|
|
|
auto& gpu = gpus[rank];
|
|
if (gpu["id"] != rank) {
|
|
throw Error("GPU rank does not match", ErrorCode::ExecutorError);
|
|
}
|
|
this->inputChunks = gpu["input_chunks"];
|
|
this->outputChunks = gpu["output_chunks"];
|
|
this->scratchChunks = gpu["scratch_chunks"];
|
|
checkMessageSize();
|
|
|
|
this->setupChannels(gpus);
|
|
this->setupRemoteBuffers(gpus);
|
|
this->setupSemaphores(gpu);
|
|
this->setupOperations(gpu, contsSrcOffset, constDstOffset);
|
|
}
|
|
|
|
void ExecutionPlan::Impl::lightLoadExecutionPlan(size_t inputSize, size_t outputSize, size_t contsSrcOffset,
|
|
size_t constDstOffset) {
|
|
std::ifstream file(this->planPath);
|
|
json obj = json::parse(file);
|
|
if (this->name != obj["name"]) {
|
|
throw Error("Plan name does not match", ErrorCode::ExecutorError);
|
|
}
|
|
std::string protocol = obj["protocol"];
|
|
if (protocol == "LL") {
|
|
this->isUsingPacket = true;
|
|
}
|
|
const auto& gpus = obj["gpus"];
|
|
const auto& gpu = gpus[rank];
|
|
if (gpu["id"] != rank) {
|
|
throw Error("GPU rank does not match", ErrorCode::ExecutorError);
|
|
}
|
|
|
|
this->inputChunks = gpu["input_chunks"];
|
|
this->outputChunks = gpu["output_chunks"];
|
|
this->scratchChunks = gpu["scratch_chunks"];
|
|
|
|
this->inputSize = inputSize;
|
|
this->outputSize = outputSize;
|
|
|
|
checkMessageSize();
|
|
this->setupOperations(gpu, contsSrcOffset, constDstOffset);
|
|
}
|
|
|
|
void ExecutionPlan::Impl::checkMessageSize() const {
|
|
size_t size = inputSize;
|
|
if (inputSize % bufferAlignment != 0 || outputSize % bufferAlignment != 0 ||
|
|
(inputSize / bufferAlignment) % inputChunks != 0 || (outputSize / bufferAlignment) % outputChunks != 0) {
|
|
throw Error("Input or output size is not aligned with buffer alignment or chunks", ErrorCode::ExecutorError);
|
|
}
|
|
if (this->collective == "allgather") {
|
|
size = outputSize;
|
|
}
|
|
if (size < this->minMessageSize || size > this->maxMessageSize) {
|
|
throw Error("Input or output size is not within the valid range", ErrorCode::ExecutorError);
|
|
}
|
|
}
|
|
|
|
void ExecutionPlan::Impl::parseChannels(const json& gpu, std::vector<ChannelInfo>& channelInfos,
|
|
std::vector<NvlsInfo>& nvlsInfos,
|
|
std::map<std::pair<int, ChannelType>, std::vector<int>>& chanConnectedPeersMap,
|
|
int rank) {
|
|
for (const auto& channel : gpu["channels"]) {
|
|
ChannelType chanType = convertToChannelType(channel["channel_type"]);
|
|
|
|
if (chanType == ChannelType::SWITCH) {
|
|
NvlsInfo info;
|
|
info.bufferType = convertToBufferType(channel["buffer_type"]);
|
|
for (const auto& group : channel["rank_groups"]) {
|
|
info.nChunks = (int)group["size"];
|
|
info.ranks.clear();
|
|
for (int rank : group["ranks"]) {
|
|
info.ranks.push_back(rank);
|
|
}
|
|
nvlsInfos.push_back(info);
|
|
}
|
|
} else {
|
|
ChannelInfo info;
|
|
info.channelType = chanType;
|
|
for (const auto& peer : channel["connected_to"]) {
|
|
info.connectedPeers.push_back(peer);
|
|
chanConnectedPeersMap[{peer, info.channelType}].push_back(rank);
|
|
this->channelCountMap_[{rank, info.channelType}][peer]++;
|
|
}
|
|
channelInfos.push_back(info);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Parses the "remote_buffers" array of every GPU entry.
// For each remote buffer of GPU `g` a BufferInfo is appended to
// remoteBufferInfos_[g] and to localBufferToSend_[<rank named by the buffer>].
// bufferIndexMap_[g] maps (buffer id, channel type) to a running index that is
// counted separately per channel type within that GPU.
void ExecutionPlan::Impl::parseRemoteBuffer(const nlohmann::json& gpus) {
  for (const auto& gpu : gpus) {
    std::unordered_map<ChannelType, int> nextIndexPerChannel;
    int gpuRank = gpu["id"];
    auto& remoteInfos = this->remoteBufferInfos_[gpuRank];
    auto& indexMap = this->bufferIndexMap_[gpuRank];
    for (const auto& remoteBuffer : gpu["remote_buffers"]) {
      // The buffer id is its position in this GPU's remote buffer list.
      int bufferId = remoteInfos.size();
      int ownerRank = remoteBuffer["rank"];
      BufferType bufferType = convertToBufferType(remoteBuffer["type"]);
      std::vector<ChannelType> accessChannels;
      for (const auto& channel : remoteBuffer["access_channel_types"]) {
        const ChannelType chanType = convertToChannelType(channel);
        accessChannels.push_back(chanType);
        const int slot = nextIndexPerChannel[chanType]++;
        indexMap[{bufferId, chanType}] = slot;
      }
      BufferInfo info{ownerRank, gpuRank, bufferType, accessChannels};
      remoteInfos.push_back(info);
      this->localBufferToSend_[ownerRank].push_back(info);
    }
  }
}
|
|
|
|
// Construct the channel info. Step 1. Flatten MEMORY and PORT channels into separate vectors.
|
|
// Step 2. For each threadblock, construct a vector of channel indexes and keys.
|
|
void ExecutionPlan::Impl::setupChannels(const json& gpus) {
|
|
using mapKey = std::pair<int, ChannelType>;
|
|
std::map<mapKey, std::vector<int>> chanConnectedPeersMap;
|
|
for (const auto& gpu : gpus) {
|
|
int rank = gpu["id"];
|
|
std::vector<ChannelInfo> channelInfos;
|
|
std::vector<NvlsInfo> nvlsInfos;
|
|
this->parseChannels(gpu, channelInfos, nvlsInfos, chanConnectedPeersMap, rank);
|
|
this->channelInfos_[rank] = channelInfos;
|
|
this->nvlsInfos[rank] = nvlsInfos;
|
|
}
|
|
|
|
for (const auto& [key, connectedFrom] : chanConnectedPeersMap) {
|
|
auto [peer, channelType] = key;
|
|
ChannelInfo info;
|
|
info.channelType = channelType;
|
|
info.connectedPeers = connectedFrom;
|
|
this->channelInfosByDstRank_[peer].push_back(info);
|
|
}
|
|
|
|
// setup threadblockChannels
|
|
const auto& gpu = gpus[rank];
|
|
int nthreadblocks = gpu["threadblocks"].size();
|
|
this->threadblockMemoryChannels.resize(nthreadblocks);
|
|
this->threadblockPortChannels.resize(nthreadblocks);
|
|
this->threadblockNvlsChannels.resize(nthreadblocks);
|
|
for (const auto& threadblock : gpu["threadblocks"]) {
|
|
for (const auto& channel : threadblock["channels"]) {
|
|
ChannelType channelType = convertToChannelType(channel["channel_type"]);
|
|
ChannelKey key = {BufferType::NONE, channelType};
|
|
if (channel.contains("buff")) {
|
|
key = {convertToBufferType(channel["buff"]), channelType};
|
|
}
|
|
for (int id : channel["channel_ids"]) {
|
|
if (channelType == ChannelType::MEMORY) {
|
|
this->threadblockMemoryChannels[threadblock["id"]].emplace_back(id);
|
|
} else if (channelType == ChannelType::PORT) {
|
|
this->threadblockPortChannels[threadblock["id"]].emplace_back(id);
|
|
} else if (channelType == ChannelType::SWITCH) {
|
|
this->threadblockNvlsChannels[threadblock["id"]].emplace_back(id);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void ExecutionPlan::Impl::setupRemoteBuffers(const json& gpus) {
|
|
this->parseRemoteBuffer(gpus);
|
|
|
|
// setup threadblockBuffers
|
|
const auto& gpu = gpus[rank];
|
|
int nthreadblocks = gpu["threadblocks"].size();
|
|
this->threadblockMemoryChannelBuffers.resize(nthreadblocks);
|
|
this->threadblockPortChannelBuffers.resize(nthreadblocks);
|
|
for (const auto& threadblock : gpu["threadblocks"]) {
|
|
if (!threadblock.contains("remote_buffer_refs")) {
|
|
continue;
|
|
}
|
|
for (const auto& remoteBuffRef : threadblock["remote_buffer_refs"]) {
|
|
ChannelType accessChanType = convertToChannelType(remoteBuffRef["access_channel_type"]);
|
|
if (accessChanType == ChannelType::PORT) {
|
|
for (const auto& bufferId : remoteBuffRef["remote_buffer_ids"]) {
|
|
BufferType type = this->remoteBufferInfos_[rank][bufferId].bufferType;
|
|
this->threadblockPortChannelBuffers[threadblock["id"]].push_back(
|
|
{this->bufferIndexMap_[rank][{bufferId, accessChanType}], type});
|
|
}
|
|
} else if (accessChanType == ChannelType::MEMORY) {
|
|
for (const auto& bufferId : remoteBuffRef["remote_buffer_ids"]) {
|
|
BufferType type = this->remoteBufferInfos_[rank][bufferId].bufferType;
|
|
this->threadblockMemoryChannelBuffers[threadblock["id"]].push_back(
|
|
{this->bufferIndexMap_[rank][{bufferId, accessChanType}], type});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void ExecutionPlan::Impl::setupSemaphores(const json& gpu) {
|
|
if (!gpu.contains("semaphores")) {
|
|
return;
|
|
}
|
|
for (const auto& sem : gpu["semaphores"]) {
|
|
SemaphoreInfo info;
|
|
info.initValue = sem["init_value"];
|
|
this->semaphoreInfos.push_back(info);
|
|
}
|
|
}
|
|
|
|
void ExecutionPlan::Impl::setupOperations(const json& gpu, size_t constSrcOffset, size_t constDstOffset) {
|
|
// setup threadblocks and operations
|
|
for (const auto& threadblock : gpu["threadblocks"]) {
|
|
int threadBlockId = threadblock["id"];
|
|
std::vector<Operation> ops;
|
|
for (const auto& op : threadblock["ops"]) {
|
|
Operation operation = {};
|
|
this->setupOperation(op, operation, rank, threadBlockId, constSrcOffset, constDstOffset);
|
|
ops.push_back(operation);
|
|
if (operation.type == OperationType::PIPELINE) {
|
|
for (const auto& innerOp : op["ops"]) {
|
|
Operation pipelineOp = {};
|
|
this->setupOperation(innerOp, pipelineOp, rank, threadBlockId, constSrcOffset, constDstOffset);
|
|
ops.push_back(pipelineOp);
|
|
}
|
|
}
|
|
}
|
|
this->operations.push_back(ops);
|
|
}
|
|
}
|
|
|
|
// Fills `operation` from its JSON description `op`.
// Optional keys handled: "channel_type", "channel_ids", "tbg_info", "src_buff",
// "dst_buff", "barrier_id", "num_threadblocks", "semaphore_ids", "iter_context".
// For each source/destination buffer the byte offset and size are derived from
// the chunk index/size in the plan, then narrowed to this threadblock's slice
// when "tbg_info" is present. `constSrcOffset` / `constDstOffset` are added to
// offsets that resolve to INPUT / OUTPUT buffers through a "buffer_id" or a
// "switch_channel_id".
void ExecutionPlan::Impl::setupOperation(const nlohmann::json& op, Operation& operation, int rank, int threadBlockId,
                                         size_t constSrcOffset, size_t constDstOffset) {
  // Caller-supplied base offset for a buffer type (scratch buffers have none).
  const auto baseOffsetFor = [&](BufferType type) -> size_t {
    if (type == BufferType::INPUT) {
      return constSrcOffset;
    }
    if (type == BufferType::OUTPUT) {
      return constDstOffset;
    }
    if (type == BufferType::SCRATCH) {
      return 0;
    }
    throw Error("Invalid buffer type", ErrorCode::ExecutorError);
  };

  // Buffer type of the remote buffer a threadblock references through the given channel type.
  const auto remoteBufferTypeOf = [&](int bufferId, int tb, ChannelType channelType) -> BufferType {
    switch (channelType) {
      case ChannelType::MEMORY:
        return this->threadblockMemoryChannelBuffers[tb][bufferId].second;
      case ChannelType::PORT:
        return this->threadblockPortChannelBuffers[tb][bufferId].second;
      default:
        throw Error("Invalid channel type", ErrorCode::ExecutorError);
    }
  };

  // Slice of each buffer handled by this threadblock; the defaults cover the whole buffer.
  uint32_t sliceIndex = 0;
  uint32_t sliceCount = 1;

  operation.type = static_cast<mscclpp::OperationType>(getOpType(op["name"]));
  if (op.contains("channel_type")) {
    operation.channelType = convertToChannelType(op["channel_type"]);
  }
  if (op.contains("channel_ids")) {
    const auto& channelIds = op["channel_ids"];
    operation.nChannels = channelIds.size();
    for (uint32_t i = 0; i < channelIds.size(); i++) {
      operation.channelIndexes[i] = channelIds[i];
    }
  }
  if (op.contains("tbg_info")) {
    const auto& tbgInfo = op["tbg_info"];
    sliceIndex = tbgInfo["tb_id"];
    sliceCount = tbgInfo["tbg_size"];
  }

  if (op.contains("src_buff")) {
    const auto& sources = op["src_buff"];
    operation.nInputs = sources.size();
    for (int i = 0; i < operation.nInputs; i++) {
      const auto& buff = sources[i];
      size_t baseOffset = 0;
      BufferType bufferType = BufferType::NONE;
      if (buff.contains("type")) {
        bufferType = convertToBufferType(buff["type"]);
        operation.inputBufferRefs[i].type = bufferType;
      }
      if (buff.contains("buffer_id")) {
        operation.inputBufferRefs[i].id = buff["buffer_id"];
        bufferType = remoteBufferTypeOf(buff["buffer_id"], threadBlockId, operation.channelType);
        baseOffset = baseOffsetFor(bufferType);
      }
      if (buff.contains("switch_channel_id")) {
        int switchChannelIdx = this->threadblockNvlsChannels[threadBlockId][buff["switch_channel_id"]];
        bufferType = this->nvlsInfos[rank][switchChannelIdx].bufferType;
        baseOffset = baseOffsetFor(bufferType);
        operation.nvlsInputBufferType = bufferType;
        operation.nvlsInputIndex = buff["switch_channel_id"];
      }
      size_t offset = this->getOffset(this->inputSize, this->outputSize, buff["index"], bufferType) + baseOffset;
      const size_t fullSize = this->getBufferSize(this->inputSize, this->outputSize, buff["index"], buff["size"]);
      offset += calcOffset(fullSize, sliceIndex, sliceCount);
      operation.inputOffsets[i] = offset;
      operation.inputBufferSizes[i] = calcSize(fullSize, sliceIndex, sliceCount);
    }
  }

  if (op.contains("dst_buff")) {
    const auto& destinations = op["dst_buff"];
    operation.nOutputs = destinations.size();
    for (int i = 0; i < operation.nOutputs; i++) {
      const auto& buff = destinations[i];
      size_t baseOffset = 0;
      BufferType bufferType = BufferType::NONE;
      if (buff.contains("type")) {
        bufferType = convertToBufferType(buff["type"]);
        operation.outputBufferRefs[i].type = bufferType;
      }
      if (buff.contains("buffer_id")) {
        operation.outputBufferRefs[i].id = buff["buffer_id"];
        bufferType = remoteBufferTypeOf(buff["buffer_id"], threadBlockId, operation.channelType);
        baseOffset = baseOffsetFor(bufferType);
      }
      if (buff.contains("switch_channel_id")) {
        int switchChannelIdx = this->threadblockNvlsChannels[threadBlockId][buff["switch_channel_id"]];
        bufferType = this->nvlsInfos[rank][switchChannelIdx].bufferType;
        baseOffset = baseOffsetFor(bufferType);
        operation.nvlsOutputBufferType = bufferType;
        operation.nvlsOutputIndex = buff["switch_channel_id"];
      }
      size_t offset = this->getOffset(this->inputSize, this->outputSize, buff["index"], bufferType) + baseOffset;
      const size_t fullSize = this->getBufferSize(this->inputSize, this->outputSize, buff["index"], buff["size"]);
      offset += calcOffset(fullSize, sliceIndex, sliceCount);
      operation.outputOffsets[i] = offset;
      operation.outputBufferSizes[i] = calcSize(fullSize, sliceIndex, sliceCount);
    }
  }

  if (op.contains("barrier_id")) {
    operation.deviceSyncerIndex = op["barrier_id"];
  }
  if (op.contains("num_threadblocks")) {
    operation.nThreadBlocks = op["num_threadblocks"];
  }
  if (op.contains("semaphore_ids")) {
    const auto& semaphoreIds = op["semaphore_ids"];
    operation.nDeviceSemaphores = semaphoreIds.size();
    for (uint32_t id = 0; id < operation.nDeviceSemaphores; id++) {
      operation.deviceSemaphoreIds[id] = semaphoreIds[id];
    }
  }
  if (op.contains("iter_context")) {
    const auto& iterContext = op["iter_context"];
    operation.unitSize = iterContext["unit_size"];
    operation.nOperations = op["ops"].size();
    int nChunks = iterContext["num_chunks"];
    // Number of iterations = ceil(total bytes covered / unit size).
    size_t totalBytes = nChunks * getUpperBoundChunkSize(this->inputSize, this->outputSize);
    operation.nIterations = (totalBytes + (operation.unitSize - 1)) / operation.unitSize;
  }
}
|
|
|
|
std::pair<size_t, uint32_t> ExecutionPlan::Impl::getSizeAndChunks(size_t inputSize, size_t outputSize) const {
|
|
std::pair<size_t, uint32_t> sizePerRank;
|
|
if (this->inputChunks == 0 && this->outputChunks == 0) {
|
|
throw mscclpp::Error("Output or Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError);
|
|
} else if (this->inputChunks != 0 && this->outputChunks != 0) {
|
|
if (inputSize / this->inputChunks != outputSize / this->outputChunks)
|
|
throw mscclpp::Error("Size per chunks inconsistent: inputSize " + std::to_string(inputSize) + " inputChunks " +
|
|
std::to_string(this->inputChunks) + " outputSize " + std::to_string(outputSize) +
|
|
" outputChunks " + std::to_string(this->outputChunks),
|
|
mscclpp::ErrorCode::ExecutorError);
|
|
else
|
|
sizePerRank = std::make_pair(inputSize, this->inputChunks);
|
|
} else if (this->inputChunks != 0) {
|
|
sizePerRank = std::make_pair(inputSize, this->inputChunks);
|
|
} else if (this->outputChunks != 0) {
|
|
sizePerRank = std::make_pair(outputSize, this->outputChunks);
|
|
}
|
|
return sizePerRank;
|
|
}
|
|
|
|
size_t ExecutionPlan::Impl::calcOffset(size_t size, uint32_t index, uint32_t slices) const {
|
|
uint32_t nelems = size / (this->bufferAlignment * sizeof(uint8_t));
|
|
uint32_t minNelems = nelems / slices;
|
|
uint32_t remainder = nelems % slices;
|
|
uint32_t offset = index * minNelems + (index % nelems < remainder ? index % nelems : remainder);
|
|
return static_cast<size_t>(offset) * this->bufferAlignment;
|
|
}
|
|
|
|
size_t ExecutionPlan::Impl::calcSize(size_t size, uint32_t index, uint32_t slices) const {
|
|
uint32_t beginOff = calcOffset(size, index, slices);
|
|
uint32_t endOff = calcOffset(size, index + 1, slices);
|
|
return endOff - beginOff;
|
|
}
|
|
|
|
size_t ExecutionPlan::Impl::getOffset(size_t inputSize, size_t outputSize, uint32_t chunkIndex,
|
|
BufferType bufferType) const {
|
|
auto rankSizeAndChunks = getSizeAndChunks(inputSize, outputSize);
|
|
uint32_t nChunks = rankSizeAndChunks.second;
|
|
uint32_t chunkSize = (rankSizeAndChunks.first + nChunks - 1) / nChunks;
|
|
uint32_t scratchChunkSize = this->calMaxScratchChunkSize(PREDFINED_SCRATCH_SIZE);
|
|
|
|
// Reuse scratch buffer for large input/output chunks
|
|
if (bufferType == BufferType::SCRATCH && this->reuseResources && scratchChunkSize < chunkSize) {
|
|
return chunkIndex * this->calMaxScratchChunkSize(PREDFINED_SCRATCH_SIZE);
|
|
}
|
|
|
|
return calcOffset(rankSizeAndChunks.first, chunkIndex, nChunks);
|
|
}
|
|
|
|
size_t ExecutionPlan::Impl::getBufferSize(size_t inputSize, size_t outputSize, uint32_t index, uint32_t nChunks) const {
|
|
uint32_t beginOff = getOffset(inputSize, outputSize, index);
|
|
uint32_t endOff = getOffset(inputSize, outputSize, index + nChunks);
|
|
return endOff - beginOff;
|
|
}
|
|
|
|
size_t ExecutionPlan::Impl::getUpperBoundChunkSize(size_t inputSize, size_t outputSize) const {
|
|
size_t nInputChunks = this->inputChunks;
|
|
size_t nOutputChunks = this->outputChunks;
|
|
if (nInputChunks != 0) {
|
|
size_t nelems = inputSize / this->bufferAlignment;
|
|
return (nelems + nInputChunks - 1) / nInputChunks * this->bufferAlignment;
|
|
}
|
|
if (nOutputChunks != 0) {
|
|
size_t nelems = outputSize / this->bufferAlignment;
|
|
return (nelems + nOutputChunks - 1) / nOutputChunks * this->bufferAlignment;
|
|
}
|
|
throw mscclpp::Error("Output or Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError);
|
|
}
|
|
|
|
void ExecutionPlan::Impl::reset() {
|
|
this->operations.clear();
|
|
this->nvlsInfos.clear();
|
|
this->threadblockMemoryChannels.clear();
|
|
this->threadblockPortChannels.clear();
|
|
this->threadblockNvlsChannels.clear();
|
|
this->threadblockMemoryChannelBuffers.clear();
|
|
this->threadblockPortChannelBuffers.clear();
|
|
this->semaphoreInfos.clear();
|
|
|
|
this->channelInfos_.clear();
|
|
this->channelCountMap_.clear();
|
|
this->channelInfosByDstRank_.clear();
|
|
this->remoteBufferInfos_.clear();
|
|
this->localBufferToSend_.clear();
|
|
this->bufferIndexMap_.clear();
|
|
}
|
|
|
|
void ExecutionPlan::Impl::operationsReset() { this->operations.clear(); }
|
|
|
|
// Creates the shared implementation object for the plan file at `planPath` and
// the given rank.
ExecutionPlan::ExecutionPlan(const std::string& planPath, int rank)
    : impl_(std::make_shared<Impl>(planPath, rank)) {
}
|
|
|
|
const std::string& ExecutionPlan::name() const { return this->impl_->name; }
|
|
|
|
const std::string& ExecutionPlan::collective() const { return this->impl_->collective; }
|
|
|
|
size_t ExecutionPlan::minMessageSize() const { return this->impl_->minMessageSize; }
|
|
|
|
size_t ExecutionPlan::maxMessageSize() const { return this->impl_->maxMessageSize; }
|
|
|
|
bool ExecutionPlan::isInPlace() const { return this->impl_->isInPlace; }
|
|
|
|
} // namespace mscclpp
|