mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 08:50:21 +00:00
Merge branch 'main' into caiorocha/support_tbg_pipeline
This commit is contained in:
42
.azure-pipelines/templates/ut-executor.yml
Normal file
42
.azure-pipelines/templates/ut-executor.yml
Normal file
@@ -0,0 +1,42 @@
|
||||
# Step template: deploy a test VMSS, run the executor tests on it, then stop it.
parameters:
# Values forwarded to deploy.yml and stop.yml below.
- { name: subscription, type: string }
- { name: vmssName, type: string }
# Forwarded to deploy.yml and appended to its deployArgs.
- { name: platform, type: string, default: 'cuda' }
# Forwarded to deploy.yml.
- { name: gpuArch, type: string }

steps:
# 1) Deploy with the single-node test arguments.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    platform: ${{ parameters.platform }}
    gpuArch: ${{ parameters.gpuArch }}
    deployArgs: 'single-node-test true ${{ parameters.platform }}'

# 2) Install the Python package, then run executor_test.py once per execution
#    plan. Every run uses 2 MPI processes, a 32M size and --in_place.
- template: run-remote-task.yml
  parameters:
    name: ExecutorTest
    displayName: Run executor tests
    remoteScript: |
      python3 -m pip install .
      PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans
      TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py
      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place
      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place
      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place
      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place
      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place
      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place
      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place
      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place

# 3) Stop the VMSS that was deployed in step 1.
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
|
||||
@@ -148,3 +148,24 @@ jobs:
|
||||
vmssName: mscclpp-mi300x-ci
|
||||
platform: rocm
|
||||
gpuArch: gfx942
|
||||
|
||||
# Runs the DSL executor tests through templates/ut-executor.yml.
- job: UnitTestExecutor
  timeoutInMinutes: 60
  displayName: Test DSL Executor
  pool:
    name: msccl-ci-h100

  # Single matrix entry; it only selects the container image used below.
  strategy:
    matrix:
      cuda12:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9

  container:
    image: $(containerImage)

  steps:
  # `platform` is not passed, so the template's default ('cuda') applies.
  - template: templates/ut-executor.yml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: mscclpp-h100-ci
      gpuArch: '90'
|
||||
@@ -389,7 +389,7 @@ struct EndpointConfig {
|
||||
};
|
||||
|
||||
static constexpr int DefaultPort = -1;
|
||||
static constexpr int DefaultGidIndex = 0;
|
||||
static constexpr int DefaultGidIndex = -1;
|
||||
static constexpr int DefaultMaxCqSize = 1024;
|
||||
static constexpr int DefaultMaxCqPollNum = 1;
|
||||
static constexpr int DefaultMaxSendWr = 8192;
|
||||
@@ -418,7 +418,7 @@ struct EndpointConfig {
|
||||
/// Constructor.
|
||||
/// @param deviceIndex Device index.
|
||||
/// @param port Port number.
|
||||
/// @param gidIndex GID index.
|
||||
/// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable.
|
||||
/// @param maxCqSize Maximum send completion queue size.
|
||||
/// @param maxCqPollNum Maximum send completion queue poll count.
|
||||
/// @param maxSendWr Maximum outstanding send work requests.
|
||||
|
||||
@@ -115,6 +115,10 @@ class Env {
|
||||
/// Default is false.
|
||||
const bool forceDisableGdr;
|
||||
|
||||
/// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
|
||||
/// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified).
|
||||
const int ibGidIndex;
|
||||
|
||||
private:
|
||||
Env();
|
||||
|
||||
|
||||
@@ -23,7 +23,8 @@ void register_env(nb::module_& m) {
|
||||
.def_ro("ibv_mode", &Env::ibvMode)
|
||||
.def_ro("cache_dir", &Env::cacheDir)
|
||||
.def_ro("npkit_dump_dir", &Env::npkitDumpDir)
|
||||
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream);
|
||||
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream)
|
||||
.def_ro("ib_gid_index", &Env::ibGidIndex);
|
||||
|
||||
m.def("env", &env);
|
||||
}
|
||||
|
||||
@@ -47,6 +47,11 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve GID index: explicit value (>= 0) takes priority, otherwise use env
|
||||
if (config_.ib.gidIndex < 0) {
|
||||
config_.ib.gidIndex = env()->ibGidIndex;
|
||||
}
|
||||
|
||||
int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
|
||||
|
||||
ibQp_ = contextImpl.getIbContext(config_.transport)
|
||||
|
||||
@@ -66,7 +66,8 @@ Env::Env()
|
||||
forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
|
||||
ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
|
||||
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
|
||||
forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)) {}
|
||||
forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)),
|
||||
ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", 0)) {}
|
||||
|
||||
std::shared_ptr<Env> env() {
|
||||
static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
|
||||
@@ -95,6 +96,7 @@ std::shared_ptr<Env> env() {
|
||||
logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
|
||||
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
|
||||
logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
|
||||
logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
|
||||
}
|
||||
return globalEnv;
|
||||
}
|
||||
|
||||
@@ -103,10 +103,10 @@ static int GetGpuClockRateInKhz() {
|
||||
else
|
||||
return 25000;
|
||||
#else
|
||||
cudaDeviceProp dev_prop;
|
||||
int clockRate;
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&dev_id));
|
||||
MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev_id));
|
||||
return dev_prop.clockRate;
|
||||
MSCCLPP_CUDATHROW(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev_id));
|
||||
return clockRate;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
87
test/executor-tests/algos/reduce.py
Normal file
87
test/executor-tests/algos/reduce.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
Reduce Test
|
||||
|
||||
This file tests the PUT, GET, COPY, REDUCE_SEND and READ_REDUCE_SEND
|
||||
operations. It implements a 2-GPU allreduce using the Simple protocol
|
||||
with instruction fusion enabled.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from mscclpp.language.channel import *
|
||||
from mscclpp.language.rank import *
|
||||
from mscclpp.language.general import *
|
||||
from mscclpp.language.program import *
|
||||
from mscclpp.language.collectives import *
|
||||
|
||||
|
||||
def reduce(name, num_threads_per_block, min_message_size, max_message_size):
    """Build the 2-rank, Simple-protocol allreduce test program and print it.

    All operations are scheduled on thread block 0 and instruction fusion is
    enabled. The resulting program is written to stdout via ``print(JSON())``.

    Args:
        name: Program name forwarded to ``CollectiveProgram``.
        num_threads_per_block: Forwarded to ``CollectiveProgram``.
        min_message_size: Forwarded to ``CollectiveProgram``.
        max_message_size: Forwarded to ``CollectiveProgram``.
    """
    program = CollectiveProgram(
        name,
        AllReduce(2, 2, True),
        2,
        protocol="Simple",
        instr_fusion=True,
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=False,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    )
    with program:
        # Handles for both ranks: one memory channel per direction, the input
        # buffers, and a 4-slot scratch buffer on each rank.
        rank0 = Rank(0)
        rank1 = Rank(1)
        chan0 = MemoryChannel(1, 0)
        chan1 = MemoryChannel(0, 1)
        input0 = rank0.get_input_buffer()
        input1 = rank1.get_input_buffer()
        scratch0 = Buffer(0, 4)
        scratch1 = Buffer(1, 4)

        # Stage input chunks in the local scratch buffers (rank 0: slots 2-3,
        # rank 1: slots 0-1) ahead of the remote accesses below.
        rank0.copy(scratch0[2:4], input0[2:4], tb=0)
        rank1.copy(scratch1[0:2], input1[0:2], tb=0)

        # Handshake: both ranks signal, then both wait, before touching the
        # peer's staged data.
        chan0.signal(tb=0)
        chan1.signal(tb=0)

        chan0.wait(tb=0)
        chan1.wait(tb=0)

        # Chunk 0: rank 0 reduces rank 1's scratch slot into its own input,
        # then pushes the result into rank 1's input.
        chan0.reduce(input0[0:1], [scratch1[0:1]], tb=0)
        chan0.put(input1[0:1], input0[0:1], tb=0)

        # Chunk 1: rank 0 first pulls rank 1's scratch slot into its own
        # scratch, reduces locally, then pushes the result into rank 1's input.
        chan0.get(scratch0[1:2], scratch1[1:2], tb=0)
        rank0.reduce(input0[1:2], [scratch0[1:2]], tb=0)
        chan0.put(input1[1:2], input0[1:2], tb=0)

        # Chunks 2-3: rank 1 reduces rank 0's input into its own input, stages
        # the result in scratch, and pushes it from scratch into rank 0's input.
        chan1.reduce(input1[2:4], [input0[2:4]], tb=0)
        rank1.copy(scratch1[2:4], input1[2:4], tb=0)
        chan1.put(input0[2:4], scratch1[2:4], tb=0)

        # Closing handshake so neither rank finishes before the peer's writes.
        chan0.signal(tb=0)
        chan1.signal(tb=0)

        chan0.wait(tb=0)
        chan1.wait(tb=0)

        print(JSON())
|
||||
|
||||
|
||||
# Command-line entry point: parse the program options and emit the plan.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--name",
    type=str,
    help="name of the program",
)
parser.add_argument(
    "--num_threads_per_block",
    type=int,
    default=1024,
    help="number of threads per block",
)
parser.add_argument(
    "--min_message_size",
    type=int,
    default=0,
    help="minimum message size",
)
parser.add_argument(
    "--max_message_size",
    type=int,
    default=2**64 - 1,
    help="maximum message size",
)

args = parser.parse_args()

reduce(
    name=args.name,
    num_threads_per_block=args.num_threads_per_block,
    min_message_size=args.min_message_size,
    max_message_size=args.max_message_size,
)
|
||||
91
test/executor-tests/algos/reduce_nvls.py
Normal file
91
test/executor-tests/algos/reduce_nvls.py
Normal file
@@ -0,0 +1,91 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
Reduce NVLS Test
|
||||
|
||||
This file tests the executor MULTI_LOAD_REDUCE_STORE operation using
|
||||
NVLS SwitchChannels. Each GPU reduces its chunk via the
|
||||
NVSwitch and broadcasts the result to all other GPUs.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from mscclpp.language.channel import *
|
||||
from mscclpp.language.rank import *
|
||||
from mscclpp.language.general import *
|
||||
from mscclpp.language.program import *
|
||||
from mscclpp.language.collectives import *
|
||||
|
||||
|
||||
def reduce_nvls(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
    """Build the SwitchChannel-based allreduce test program and print it.

    For every rank ``g`` the program reduces chunk ``g`` through the switch
    channel into that rank's input buffer and then broadcasts it. Memory
    channels between every pair of ranks are used only for the signal/wait
    handshakes before and after. All operations run on thread block 0. The
    program is written to stdout via ``print(JSON())``.

    Args:
        name: Program name forwarded to ``CollectiveProgram``.
        gpu_size: Number of ranks taking part.
        num_threads_per_block: Forwarded to ``CollectiveProgram``.
        min_message_size: Forwarded to ``CollectiveProgram``.
        max_message_size: Forwarded to ``CollectiveProgram``.
    """
    program = CollectiveProgram(
        name,
        AllReduce(gpu_size, 1, True),
        gpu_size,
        instances=1,
        protocol="Simple",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=False,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    )
    with program:
        # One switch channel spanning every rank, plus a memory channel for
        # each ordered (destination, source) pair of distinct ranks.
        nvls_chan = SwitchChannel(rank_list=list(range(gpu_size)), buffer_type=BufferType.input)
        channels = {
            (peer, gpu): MemoryChannel(peer, gpu)
            for gpu in range(gpu_size)
            for peer in range(gpu_size)
            if peer != gpu
        }

        # Opening handshake: each rank signals all of its peers, then waits on
        # all of them.
        for src_rank in range(gpu_size):
            peers = [peer for peer in range(gpu_size) if peer != src_rank]
            for dst_rank in peers:
                channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True)
            for dst_rank in peers:
                channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True, data_sync=SyncType.after)

        # Rank g handles chunk g: reduce it via the switch channel into the
        # local input buffer, then broadcast that chunk back out.
        for gpu in range(gpu_size):
            local_input = Rank(gpu).get_input_buffer()
            nvls_chan.at_rank(gpu).reduce(
                buffer_offset=gpu, size=1, dst_chunk=local_input[gpu : gpu + 1], tb=0
            )
            nvls_chan.at_rank(gpu).broadcast(
                src_chunk=local_input[gpu : gpu + 1], buffer_offset=gpu, size=1, tb=0
            )

        # Closing handshake, mirroring the opening one.
        for src_rank in range(gpu_size):
            peers = [peer for peer in range(gpu_size) if peer != src_rank]
            for dst_rank in peers:
                channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
            for dst_rank in peers:
                channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True)

        print(JSON())
|
||||
|
||||
|
||||
# Command-line entry point: parse the program options and emit the plan.
parser = argparse.ArgumentParser()

parser.add_argument("--name", type=str, help="name of the program")
# --num_gpus has no usable default: the builder iterates over range(num_gpus),
# so a missing value would fail with a TypeError. Requiring it makes argparse
# report the problem as a usage error instead.
parser.add_argument("--num_gpus", type=int, required=True, help="number of gpus")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")

args = parser.parse_args()

reduce_nvls(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
|
||||
94
test/executor-tests/algos/reduce_nvls_pipeline.py
Normal file
94
test/executor-tests/algos/reduce_nvls_pipeline.py
Normal file
@@ -0,0 +1,94 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
Reduce NVLS Pipeline Test
|
||||
|
||||
This file tests the executor MULTI_LOAD_REDUCE_STORE operation in a
|
||||
pipeline context using SwitchChannel. Each GPU reduces
|
||||
its chunk via the NVSwitch and broadcasts the result, processing data
|
||||
in a pipelined loop over fixed-size iterations.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from mscclpp.language.channel import *
|
||||
from mscclpp.language.rank import *
|
||||
from mscclpp.language.general import *
|
||||
from mscclpp.language.program import *
|
||||
from mscclpp.language.collectives import *
|
||||
from mscclpp.language.loop import LoopIterationContext
|
||||
|
||||
|
||||
def reduce_nvls_pipeline(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
    """Build the pipelined SwitchChannel allreduce test program and print it.

    Same structure as the non-pipelined NVLS test, except that the
    reduce/broadcast step for every rank is issued inside a
    ``LoopIterationContext(unit=2**20, num_chunks=1)``. Memory channels
    between every pair of ranks carry only the signal/wait handshakes. All
    operations run on thread block 0. The program is written to stdout via
    ``print(JSON())``.

    Args:
        name: Program name forwarded to ``CollectiveProgram``.
        gpu_size: Number of ranks taking part.
        num_threads_per_block: Forwarded to ``CollectiveProgram``.
        min_message_size: Forwarded to ``CollectiveProgram``.
        max_message_size: Forwarded to ``CollectiveProgram``.
    """
    program = CollectiveProgram(
        name,
        AllReduce(gpu_size, 1, True),
        gpu_size,
        instances=1,
        protocol="Simple",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=False,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    )
    with program:
        # One switch channel spanning every rank, plus a memory channel for
        # each ordered (destination, source) pair of distinct ranks.
        nvls_chan = SwitchChannel(rank_list=list(range(gpu_size)), buffer_type=BufferType.input)
        channels = {
            (peer, gpu): MemoryChannel(peer, gpu)
            for gpu in range(gpu_size)
            for peer in range(gpu_size)
            if peer != gpu
        }

        # Opening handshake: each rank signals all of its peers, then waits on
        # all of them.
        for src_rank in range(gpu_size):
            peers = [peer for peer in range(gpu_size) if peer != src_rank]
            for dst_rank in peers:
                channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True)
            for dst_rank in peers:
                channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True, data_sync=SyncType.after)

        # Pipelined section: inside the loop context, rank g reduces chunk g
        # via the switch channel into its input buffer and broadcasts it.
        with LoopIterationContext(unit=2**20, num_chunks=1):
            for gpu in range(gpu_size):
                local_input = Rank(gpu).get_input_buffer()
                nvls_chan.at_rank(gpu).reduce(
                    buffer_offset=gpu, size=1, dst_chunk=local_input[gpu : gpu + 1], tb=0
                )
                nvls_chan.at_rank(gpu).broadcast(
                    src_chunk=local_input[gpu : gpu + 1], buffer_offset=gpu, size=1, tb=0
                )

        # Closing handshake, mirroring the opening one.
        for src_rank in range(gpu_size):
            peers = [peer for peer in range(gpu_size) if peer != src_rank]
            for dst_rank in peers:
                channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
            for dst_rank in peers:
                channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True)

        print(JSON())
|
||||
|
||||
|
||||
# Command-line entry point: parse the program options and emit the plan.
parser = argparse.ArgumentParser()

parser.add_argument("--name", type=str, help="name of the program")
# --num_gpus has no usable default: the builder iterates over range(num_gpus),
# so a missing value would fail with a TypeError. Requiring it makes argparse
# report the problem as a usage error instead.
parser.add_argument("--num_gpus", type=int, required=True, help="number of gpus")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")

args = parser.parse_args()

reduce_nvls_pipeline(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
|
||||
73
test/executor-tests/algos/reduce_pack.py
Normal file
73
test/executor-tests/algos/reduce_pack.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
Reduce Pack Test
|
||||
|
||||
This file tests the REDUCE_COPY_SEND_PACKETS and REDUCE_SEND_PACKETS
|
||||
operations. It implements a 2-GPU allreduce with the LL (low-latency)
|
||||
packet protocol.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from mscclpp.language.channel import *
|
||||
from mscclpp.language.rank import *
|
||||
from mscclpp.language.general import *
|
||||
from mscclpp.language.program import *
|
||||
from mscclpp.language.collectives import *
|
||||
|
||||
|
||||
def reduce_pack(name, num_threads_per_block, min_message_size, max_message_size):
    """Build the 2-rank, LL-protocol packet allreduce test program and print it.

    Each rank owns one chunk: it receives the peer's copy as packets, reduces
    it into its own input buffer, and ships the reduced chunk back as packets.
    Work is spread over thread blocks 0, 1 and 2. The program is written to
    stdout via ``print(JSON())``.

    Args:
        name: Program name forwarded to ``CollectiveProgram``.
        num_threads_per_block: Forwarded to ``CollectiveProgram``.
        min_message_size: Forwarded to ``CollectiveProgram``.
        max_message_size: Forwarded to ``CollectiveProgram``.
    """
    num_ranks = 2
    program = CollectiveProgram(
        name,
        AllReduce(num_ranks, 1, True),
        num_ranks,
        protocol="LL",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=True,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    )
    with program:
        # Handles for both ranks: one memory channel per direction, the input
        # buffers, and a 3-slot scratch buffer on each rank.
        rank0 = Rank(0)
        rank1 = Rank(1)
        chan0 = MemoryChannel(1, 0)
        chan1 = MemoryChannel(0, 1)
        input0 = rank0.get_input_buffer()
        input1 = rank1.get_input_buffer()
        scratch0 = Buffer(0, 3)
        scratch1 = Buffer(1, 3)

        # Exchange (tb 0): each rank sends the chunk it does NOT own to the
        # peer's scratch buffer as packets.
        chan0.put_packets(scratch1[1:2], input0[1:2], tb=0)
        chan1.put_packets(scratch0[0:1], input1[0:1], tb=0)

        # Rank 0 (tb 1): reduce the received packets into chunk 0 of its input,
        # then send the reduced chunk to rank 1's scratch slot 0.
        rank0.reduce(input0[0:1], [scratch0[0:1]], tb=1, packet=True)
        chan0.put_packets(scratch1[0:1], input0[0:1], tb=1)

        # Rank 1 (tb 1): reduce the received packets into chunk 1 of its input,
        # pack the result into local scratch slot 2, and forward that slot to
        # rank 0's scratch slot 1.
        rank1.reduce(input1[1:2], [scratch1[1:2]], tb=1, packet=True)
        rank1.copy_packets(scratch1[2:3], input1[1:2], tb=1)
        chan1.read_put_packets(scratch0[1:2], scratch1[2:3], tb=1)

        # Final step (tb 2): each rank unpacks the peer's reduced chunk from
        # scratch into its own input buffer.
        rank0.unpack_packets(input0[1:2], scratch0[1:2], tb=2)
        rank1.unpack_packets(input1[0:1], scratch1[0:1], tb=2)

        print(JSON())
|
||||
|
||||
|
||||
# Command-line entry point: parse the program options and emit the plan.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--name",
    type=str,
    help="name of the program",
)
parser.add_argument(
    "--num_threads_per_block",
    type=int,
    default=1024,
    help="number of threads per block",
)
parser.add_argument(
    "--min_message_size",
    type=int,
    default=0,
    help="minimum message size",
)
parser.add_argument(
    "--max_message_size",
    type=int,
    default=2**64 - 1,
    help="maximum message size",
)

args = parser.parse_args()

reduce_pack(
    name=args.name,
    num_threads_per_block=args.num_threads_per_block,
    min_message_size=args.min_message_size,
    max_message_size=args.max_message_size,
)
|
||||
77
test/executor-tests/algos/reduce_pack_tbg.py
Normal file
77
test/executor-tests/algos/reduce_pack_tbg.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
Reduce Pack Thread Block Group Test
|
||||
|
||||
This file tests the REDUCE_COPY_SEND_PACKETS and REDUCE_SEND_PACKETS
|
||||
operations using thread block groups. It implements a 2-GPU allreduce
|
||||
with the LL (low-latency) packet protocol, where multiple thread
|
||||
blocks cooperate on each phase.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from mscclpp.language.channel import *
|
||||
from mscclpp.language.rank import *
|
||||
from mscclpp.language.general import *
|
||||
from mscclpp.language.program import *
|
||||
from mscclpp.language.collectives import *
|
||||
|
||||
|
||||
def reduce_pack_tbg(name, num_threads_per_block, min_message_size, max_message_size):
    """Build the thread-block-group variant of the packet allreduce test.

    Same data flow as the single-thread-block packet allreduce, but each
    phase is scheduled on a ``ThreadBlockGroup`` of two thread blocks
    (``[0, 1]``, ``[2, 3]`` and ``[4, 5]``). The program is written to stdout
    via ``print(JSON())``.

    Args:
        name: Program name forwarded to ``CollectiveProgram``.
        num_threads_per_block: Forwarded to ``CollectiveProgram``.
        min_message_size: Forwarded to ``CollectiveProgram``.
        max_message_size: Forwarded to ``CollectiveProgram``.
    """
    num_ranks = 2
    program = CollectiveProgram(
        name,
        AllReduce(num_ranks, 1, True),
        num_ranks,
        protocol="LL",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=True,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    )
    with program:
        # Handles for both ranks: one memory channel per direction, the input
        # buffers, and a 3-slot scratch buffer on each rank.
        rank0 = Rank(0)
        rank1 = Rank(1)
        chan0 = MemoryChannel(1, 0)
        chan1 = MemoryChannel(0, 1)
        input0 = rank0.get_input_buffer()
        input1 = rank1.get_input_buffer()
        scratch0 = Buffer(0, 3)
        scratch1 = Buffer(1, 3)
        # Three groups of two consecutive thread blocks, one per phase.
        exchange_tbs, reduce_tbs, unpack_tbs = [
            ThreadBlockGroup(tb_list=[2 * i, 2 * i + 1]) for i in range(3)
        ]

        # Exchange: each rank sends the chunk it does NOT own to the peer's
        # scratch buffer as packets.
        chan0.put_packets(scratch1[1:2], input0[1:2], tb_group=exchange_tbs)
        chan1.put_packets(scratch0[0:1], input1[0:1], tb_group=exchange_tbs)

        # Rank 0: reduce the received packets into chunk 0 of its input, then
        # send the reduced chunk to rank 1's scratch slot 0.
        rank0.reduce(input0[0:1], [scratch0[0:1]], tb_group=reduce_tbs, packet=True)
        chan0.put_packets(scratch1[0:1], input0[0:1], tb_group=reduce_tbs)

        # Rank 1: reduce the received packets into chunk 1 of its input, pack
        # the result into local scratch slot 2, and forward that slot to
        # rank 0's scratch slot 1.
        rank1.reduce(input1[1:2], [scratch1[1:2]], tb_group=reduce_tbs, packet=True)
        rank1.copy_packets(scratch1[2:3], input1[1:2], tb_group=reduce_tbs)
        chan1.read_put_packets(scratch0[1:2], scratch1[2:3], tb_group=reduce_tbs)

        # Final step: each rank unpacks the peer's reduced chunk from scratch
        # into its own input buffer.
        rank0.unpack_packets(input0[1:2], scratch0[1:2], tb_group=unpack_tbs)
        rank1.unpack_packets(input1[0:1], scratch1[0:1], tb_group=unpack_tbs)

        print(JSON())
|
||||
|
||||
|
||||
# Command-line entry point: parse the program options and emit the plan.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--name",
    type=str,
    help="name of the program",
)
parser.add_argument(
    "--num_threads_per_block",
    type=int,
    default=1024,
    help="number of threads per block",
)
parser.add_argument(
    "--min_message_size",
    type=int,
    default=0,
    help="minimum message size",
)
parser.add_argument(
    "--max_message_size",
    type=int,
    default=2**64 - 1,
    help="maximum message size",
)

args = parser.parse_args()

reduce_pack_tbg(
    name=args.name,
    num_threads_per_block=args.num_threads_per_block,
    min_message_size=args.min_message_size,
    max_message_size=args.max_message_size,
)
|
||||
99
test/executor-tests/algos/reduce_tbg.py
Normal file
99
test/executor-tests/algos/reduce_tbg.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
Reduce Thread Block Group Test
|
||||
|
||||
This file tests the PUT, GET, COPY, REDUCE_SEND and READ_REDUCE_SEND
|
||||
operations using thread block groups. It implements a 2-GPU allreduce
|
||||
with the Simple protocol and instruction fusion, where multiple thread
|
||||
blocks cooperate on each operation.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from mscclpp.language.channel import *
|
||||
from mscclpp.language.rank import *
|
||||
from mscclpp.language.general import *
|
||||
from mscclpp.language.program import *
|
||||
from mscclpp.language.collectives import *
|
||||
|
||||
|
||||
def reduce_tbg(name, num_threads_per_block, min_message_size, max_message_size):
    """Build the thread-block-group variant of the Simple allreduce test.

    Data-moving operations are scheduled on a ``ThreadBlockGroup`` made of
    thread blocks 0 and 1. Signals and waits are issued per thread block,
    using a dedicated memory channel for each (direction, thread block) pair.
    The program is written to stdout via ``print(JSON())``.

    Args:
        name: Program name forwarded to ``CollectiveProgram``.
        num_threads_per_block: Forwarded to ``CollectiveProgram``.
        min_message_size: Forwarded to ``CollectiveProgram``.
        max_message_size: Forwarded to ``CollectiveProgram``.
    """
    program = CollectiveProgram(
        name,
        AllReduce(2, 2, True),
        2,
        protocol="Simple",
        instr_fusion=True,
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=False,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    )
    with program:
        # Handles for both ranks. Each direction gets two memory channels: the
        # "a" channel is used with thread block 0 and by the grouped data
        # operations, the "b" channel only for thread block 1's handshakes.
        rank0 = Rank(0)
        rank1 = Rank(1)
        chan0_a = MemoryChannel(1, 0)
        chan0_b = MemoryChannel(1, 0)
        chan1_a = MemoryChannel(0, 1)
        chan1_b = MemoryChannel(0, 1)
        input0 = rank0.get_input_buffer()
        input1 = rank1.get_input_buffer()
        scratch0 = Buffer(0, 4)
        scratch1 = Buffer(1, 4)
        both_tbs = ThreadBlockGroup(tb_list=[0, 1])

        # Stage input chunks in the local scratch buffers (rank 0: slots 2-3,
        # rank 1: slots 0-1) ahead of the remote accesses below.
        rank0.copy(scratch0[2:4], input0[2:4], tb_group=both_tbs)
        rank1.copy(scratch1[0:2], input1[0:2], tb_group=both_tbs)

        # Handshake on both thread blocks: all signals first, then all waits.
        chan0_a.signal(tb=0)
        chan0_b.signal(tb=1)
        chan1_a.signal(tb=0)
        chan1_b.signal(tb=1)

        chan0_a.wait(tb=0)
        chan0_b.wait(tb=1)
        chan1_a.wait(tb=0)
        chan1_b.wait(tb=1)

        # Chunk 0: rank 0 reduces rank 1's scratch slot into its own input,
        # then pushes the result into rank 1's input.
        chan0_a.reduce(input0[0:1], [scratch1[0:1]], tb_group=both_tbs)
        chan0_a.put(input1[0:1], input0[0:1], tb_group=both_tbs)

        # Chunk 1: rank 0 first pulls rank 1's scratch slot into its own
        # scratch, reduces locally, then pushes the result into rank 1's input.
        chan0_a.get(scratch0[1:2], scratch1[1:2], tb_group=both_tbs)
        rank0.reduce(input0[1:2], [scratch0[1:2]], tb_group=both_tbs)
        chan0_a.put(input1[1:2], input0[1:2], tb_group=both_tbs)

        # Chunks 2-3: rank 1 reduces rank 0's input into its own input, stages
        # the result in scratch, and pushes it from scratch into rank 0's input.
        chan1_a.reduce(input1[2:4], [input0[2:4]], tb_group=both_tbs)
        rank1.copy(scratch1[2:4], input1[2:4], tb_group=both_tbs)
        chan1_a.put(input0[2:4], scratch1[2:4], tb_group=both_tbs)

        # Closing handshake on both thread blocks.
        chan0_a.signal(tb=0)
        chan0_b.signal(tb=1)
        chan1_a.signal(tb=0)
        chan1_b.signal(tb=1)

        chan0_a.wait(tb=0)
        chan0_b.wait(tb=1)
        chan1_a.wait(tb=0)
        chan1_b.wait(tb=1)

        print(JSON())
|
||||
|
||||
|
||||
# Command-line entry point: parse the program options and emit the plan.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--name",
    type=str,
    help="name of the program",
)
parser.add_argument(
    "--num_threads_per_block",
    type=int,
    default=1024,
    help="number of threads per block",
)
parser.add_argument(
    "--min_message_size",
    type=int,
    default=0,
    help="minimum message size",
)
parser.add_argument(
    "--max_message_size",
    type=int,
    default=2**64 - 1,
    help="maximum message size",
)

args = parser.parse_args()

reduce_tbg(
    name=args.name,
    num_threads_per_block=args.num_threads_per_block,
    min_message_size=args.min_message_size,
    max_message_size=args.max_message_size,
)
|
||||
67
test/executor-tests/algos/transfer_pack.py
Normal file
67
test/executor-tests/algos/transfer_pack.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
Transfer Pack Test
|
||||
|
||||
This file tests the UNPACK_PACKETS, COPY_PACKETS, READ_PUT_PACKETS and
|
||||
PUT_PACKETS operations. It implements a 2-GPU allgather with the LL
|
||||
(low-latency) packet protocol.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from mscclpp.language.channel import *
|
||||
from mscclpp.language.rank import *
|
||||
from mscclpp.language.general import *
|
||||
from mscclpp.language.program import *
|
||||
from mscclpp.language.collectives import *
|
||||
|
||||
|
||||
def transfer_pack(name, num_threads_per_block, min_message_size, max_message_size):
    """Build the 2-rank, LL-protocol packet allgather test program and print it.

    Rank 0 sends its chunk directly as packets; rank 1 first packs its chunk
    into local scratch and forwards that. Each rank then unpacks what it
    received into its output buffer. Work is spread over thread blocks 0, 1
    and 2. The program is written to stdout via ``print(JSON())``.

    Args:
        name: Program name forwarded to ``CollectiveProgram``.
        num_threads_per_block: Forwarded to ``CollectiveProgram``.
        min_message_size: Forwarded to ``CollectiveProgram``.
        max_message_size: Forwarded to ``CollectiveProgram``.
    """
    num_ranks = 2
    program = CollectiveProgram(
        name,
        AllGather(num_ranks, 1, True),
        num_ranks,
        protocol="LL",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=True,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    )
    with program:
        # Handles for both ranks: one memory channel per direction, the output
        # buffers, and a 2-slot scratch buffer on each rank.
        rank0 = Rank(0)
        rank1 = Rank(1)
        chan0 = MemoryChannel(1, 0)
        chan1 = MemoryChannel(0, 1)
        output0 = rank0.get_output_buffer()
        output1 = rank1.get_output_buffer()
        scratch0 = Buffer(0, 2)
        scratch1 = Buffer(1, 2)

        # Rank 0 -> rank 1: chunk 0 goes straight from the output buffer into
        # rank 1's scratch slot 0 as packets.
        chan0.put_packets(scratch1[0:1], output0[0:1], tb=0)

        # Rank 1 -> rank 0: chunk 1 is packed into local scratch slot 1 first,
        # then that slot is forwarded to rank 0's scratch slot 1.
        rank1.copy_packets(scratch1[1:2], output1[1:2], tb=0)
        chan1.read_put_packets(scratch0[1:2], scratch1[1:2], tb=1)

        # Each rank unpacks the peer's chunk from scratch into its output.
        rank0.unpack_packets(output0[1:2], scratch0[1:2], tb=1)
        rank1.unpack_packets(output1[0:1], scratch1[0:1], tb=2)

        print(JSON())
|
||||
|
||||
|
||||
# Command-line entry point: parse the program options and emit the plan.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--name",
    type=str,
    help="name of the program",
)
parser.add_argument(
    "--num_threads_per_block",
    type=int,
    default=1024,
    help="number of threads per block",
)
parser.add_argument(
    "--min_message_size",
    type=int,
    default=0,
    help="minimum message size",
)
parser.add_argument(
    "--max_message_size",
    type=int,
    default=2**64 - 1,
    help="maximum message size",
)

args = parser.parse_args()

transfer_pack(
    name=args.name,
    num_threads_per_block=args.num_threads_per_block,
    min_message_size=args.min_message_size,
    max_message_size=args.max_message_size,
)
|
||||
71
test/executor-tests/algos/transfer_pack_tbg.py
Normal file
71
test/executor-tests/algos/transfer_pack_tbg.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
Transfer Pack Thread Block Group Test
|
||||
|
||||
This file tests the UNPACK_PACKETS, COPY_PACKETS, READ_PUT_PACKETS and
|
||||
PUT_PACKETS operations using thread block groups. It implements a 2-GPU
|
||||
allgather with the LL (low-latency) packet protocol, where multiple
|
||||
thread blocks cooperate on each phase.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from mscclpp.language.channel import *
|
||||
from mscclpp.language.rank import *
|
||||
from mscclpp.language.general import *
|
||||
from mscclpp.language.program import *
|
||||
from mscclpp.language.collectives import *
|
||||
|
||||
|
||||
def transfer_pack_tbg(name, num_threads_per_block, min_message_size, max_message_size):
    """Build the 2-GPU packet-based allgather plan with thread block groups and print it.

    The program is declared with the "LL" protocol and a double scratch
    buffer. Every packet operation is issued on a ThreadBlockGroup made of
    two thread blocks instead of a single thread block.

    Args:
        name: program name forwarded to CollectiveProgram.
        num_threads_per_block: forwarded to CollectiveProgram.
        min_message_size: forwarded to CollectiveProgram.
        max_message_size: forwarded to CollectiveProgram.

    The resulting plan is written to stdout via ``print(JSON())``.
    """
    num_gpus = 2
    chunks_per_loop = 1
    # NOTE(review): the third positional argument is presumably the in-place flag — confirm.
    allgather = AllGather(num_gpus, chunks_per_loop, True)

    with CollectiveProgram(
        name,
        allgather,
        num_gpus,
        protocol="LL",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=True,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    ):
        # Ranks, channels, output buffers and scratch buffers for the two GPUs.
        # The DSL objects are created in a fixed order; keep it unchanged.
        rank0 = Rank(0)
        rank1 = Rank(1)
        chan0 = MemoryChannel(1, 0)
        chan1 = MemoryChannel(0, 1)
        out0 = rank0.get_output_buffer()
        out1 = rank1.get_output_buffer()
        scratch0 = Buffer(0, 2)
        scratch1 = Buffer(1, 2)

        # Three groups of two thread blocks each: [0, 1], [2, 3], [4, 5].
        groups = [ThreadBlockGroup(tb_list=[2 * idx, 2 * idx + 1]) for idx in range(3)]

        # Rank 0 sends its output chunk as packets to rank 1's scratch buffer.
        chan0.put_packets(scratch1[0:1], out0[0:1], tb_group=groups[0])

        # Rank 1 copies its output chunk into its own scratch buffer as packets,
        # then forwards those packets to rank 0's scratch buffer.
        rank1.copy_packets(scratch1[1:2], out1[1:2], tb_group=groups[0])
        chan1.read_put_packets(scratch0[1:2], scratch1[1:2], tb_group=groups[1])

        # Each rank unpacks the packets it received from scratch into its output buffer.
        rank0.unpack_packets(out0[1:2], scratch0[1:2], tb_group=groups[1])
        rank1.unpack_packets(out1[0:1], scratch1[0:1], tb_group=groups[2])

        print(JSON())
|
||||
|
||||
|
||||
# Command-line driver: declare the supported flags from a table, parse them,
# and emit the execution plan by calling transfer_pack_tbg with the parsed values.
parser = argparse.ArgumentParser()
for _flag, _spec in (
    ("--name", dict(type=str, help="name of the program")),
    ("--num_threads_per_block", dict(type=int, default=1024, help="number of threads per block")),
    ("--min_message_size", dict(type=int, default=0, help="minimum message size")),
    ("--max_message_size", dict(type=int, default=2**64 - 1, help="maximum message size")),
):
    parser.add_argument(_flag, **_spec)

args = parser.parse_args()

transfer_pack_tbg(
    args.name,
    args.num_threads_per_block,
    args.min_message_size,
    args.max_message_size,
)
|
||||
389
test/executor-tests/execution-plans/reduce.json
Normal file
389
test/executor-tests/execution-plans/reduce.json
Normal file
@@ -0,0 +1,389 @@
|
||||
{
|
||||
"name": "reduce",
|
||||
"collective": "allreduce",
|
||||
"protocol": "Simple",
|
||||
"inplace": true,
|
||||
"reuse_resources": false,
|
||||
"gpus": [
|
||||
{
|
||||
"id": 0,
|
||||
"input_chunks": 4,
|
||||
"output_chunks": 4,
|
||||
"scratch_chunks": 4,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "copy",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rres",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 1,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum"
|
||||
},
|
||||
{
|
||||
"name": "get",
|
||||
"src_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "res",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 1,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 1,
|
||||
"type": "s",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
},
|
||||
{
|
||||
"rank": 1,
|
||||
"type": "i",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"input_chunks": 4,
|
||||
"output_chunks": 4,
|
||||
"scratch_chunks": 4,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "copy",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 2
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rre",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "copy",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "put",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 0,
|
||||
"type": "i",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
}
|
||||
],
|
||||
"num_threads_per_block": 1024,
|
||||
"use_double_scratch_buffer": false,
|
||||
"buffer_alignment": 16,
|
||||
"min_message_size": 0,
|
||||
"max_message_size": 18446744073709551615
|
||||
}
|
||||
246
test/executor-tests/execution-plans/reduce_nvls.json
Normal file
246
test/executor-tests/execution-plans/reduce_nvls.json
Normal file
@@ -0,0 +1,246 @@
|
||||
{
|
||||
"name": "allreduce_nvls",
|
||||
"collective": "allreduce",
|
||||
"protocol": "Simple",
|
||||
"inplace": true,
|
||||
"reuse_resources": false,
|
||||
"gpus": [
|
||||
{
|
||||
"id": 0,
|
||||
"input_chunks": 2,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 0,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxsignal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxwait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "glres",
|
||||
"src_buff": [
|
||||
{
|
||||
"switch_channel_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"switch_channel_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "switch",
|
||||
"reduce_op": "sum"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxsignal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxwait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"channel_type": "switch",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"channel_type": "switch",
|
||||
"buffer_type": "i",
|
||||
"rank_groups": [
|
||||
{
|
||||
"size": 2,
|
||||
"ranks": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [],
|
||||
"semaphores": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"input_chunks": 2,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 0,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxsignal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxwait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "glres",
|
||||
"src_buff": [
|
||||
{
|
||||
"switch_channel_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"switch_channel_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "switch",
|
||||
"reduce_op": "sum"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxsignal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxwait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"channel_type": "switch",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"channel_type": "switch",
|
||||
"buffer_type": "i",
|
||||
"rank_groups": [
|
||||
{
|
||||
"size": 2,
|
||||
"ranks": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [],
|
||||
"semaphores": []
|
||||
}
|
||||
],
|
||||
"num_threads_per_block": 1024,
|
||||
"use_double_scratch_buffer": false,
|
||||
"buffer_alignment": 16,
|
||||
"min_message_size": 0,
|
||||
"max_message_size": 18446744073709551615
|
||||
}
|
||||
264
test/executor-tests/execution-plans/reduce_nvls_pipeline.json
Normal file
264
test/executor-tests/execution-plans/reduce_nvls_pipeline.json
Normal file
@@ -0,0 +1,264 @@
|
||||
{
|
||||
"name": "allreduce_nvls_pipeline",
|
||||
"collective": "allreduce",
|
||||
"protocol": "Simple",
|
||||
"inplace": true,
|
||||
"reuse_resources": false,
|
||||
"gpus": [
|
||||
{
|
||||
"id": 0,
|
||||
"input_chunks": 2,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 0,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxsignal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxwait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "pipeline",
|
||||
"iter_context": {
|
||||
"unit_size": 1048576,
|
||||
"num_chunks": 1
|
||||
},
|
||||
"ops": [
|
||||
{
|
||||
"name": "glres",
|
||||
"src_buff": [
|
||||
{
|
||||
"switch_channel_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"switch_channel_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "switch",
|
||||
"reduce_op": "sum"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxsignal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxwait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"channel_type": "switch",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"channel_type": "switch",
|
||||
"buffer_type": "i",
|
||||
"rank_groups": [
|
||||
{
|
||||
"size": 2,
|
||||
"ranks": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [],
|
||||
"semaphores": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"input_chunks": 2,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 0,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxsignal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxwait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "pipeline",
|
||||
"iter_context": {
|
||||
"unit_size": 1048576,
|
||||
"num_chunks": 1
|
||||
},
|
||||
"ops": [
|
||||
{
|
||||
"name": "glres",
|
||||
"src_buff": [
|
||||
{
|
||||
"switch_channel_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"switch_channel_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "switch",
|
||||
"reduce_op": "sum"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxsignal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rlxwait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"channel_type": "switch",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"channel_type": "switch",
|
||||
"buffer_type": "i",
|
||||
"rank_groups": [
|
||||
{
|
||||
"size": 2,
|
||||
"ranks": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [],
|
||||
"semaphores": []
|
||||
}
|
||||
],
|
||||
"num_threads_per_block": 1024,
|
||||
"use_double_scratch_buffer": false,
|
||||
"buffer_alignment": 16,
|
||||
"min_message_size": 0,
|
||||
"max_message_size": 18446744073709551615
|
||||
}
|
||||
297
test/executor-tests/execution-plans/reduce_pack.json
Normal file
297
test/executor-tests/execution-plans/reduce_pack.json
Normal file
@@ -0,0 +1,297 @@
|
||||
{
|
||||
"name": "reduce_pack",
|
||||
"collective": "allreduce",
|
||||
"protocol": "LL",
|
||||
"inplace": true,
|
||||
"reuse_resources": false,
|
||||
"gpus": [
|
||||
{
|
||||
"id": 0,
|
||||
"input_chunks": 2,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 3,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "ppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"ops": [
|
||||
{
|
||||
"name": "respkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 1,
|
||||
"type": "s",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"input_chunks": 2,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 3,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "ppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"ops": [
|
||||
{
|
||||
"name": "recspkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 0,
|
||||
"type": "s",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
}
|
||||
],
|
||||
"num_threads_per_block": 1024,
|
||||
"use_double_scratch_buffer": true,
|
||||
"buffer_alignment": 16,
|
||||
"min_message_size": 0,
|
||||
"max_message_size": 18446744073709551615
|
||||
}
|
||||
576
test/executor-tests/execution-plans/reduce_pack_tbg.json
Normal file
576
test/executor-tests/execution-plans/reduce_pack_tbg.json
Normal file
@@ -0,0 +1,576 @@
|
||||
{
|
||||
"name": "reduce_pack_tbg",
|
||||
"collective": "allreduce",
|
||||
"protocol": "LL",
|
||||
"inplace": true,
|
||||
"reuse_resources": false,
|
||||
"gpus": [
|
||||
{
|
||||
"id": 0,
|
||||
"input_chunks": 2,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 3,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "ppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"ops": [
|
||||
{
|
||||
"name": "ppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"ops": [
|
||||
{
|
||||
"name": "respkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"ops": [
|
||||
{
|
||||
"name": "respkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 1,
|
||||
"type": "s",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"input_chunks": 2,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 3,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "ppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"ops": [
|
||||
{
|
||||
"name": "ppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"ops": [
|
||||
{
|
||||
"name": "recspkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"ops": [
|
||||
{
|
||||
"name": "recspkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 0,
|
||||
"type": "s",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
}
|
||||
],
|
||||
"num_threads_per_block": 1024,
|
||||
"use_double_scratch_buffer": true,
|
||||
"buffer_alignment": 16,
|
||||
"min_message_size": 0,
|
||||
"max_message_size": 18446744073709551615
|
||||
}
|
||||
773
test/executor-tests/execution-plans/reduce_tbg.json
Normal file
773
test/executor-tests/execution-plans/reduce_tbg.json
Normal file
@@ -0,0 +1,773 @@
|
||||
{
|
||||
"name": "reduce_tbg",
|
||||
"collective": "allreduce",
|
||||
"protocol": "Simple",
|
||||
"inplace": true,
|
||||
"reuse_resources": false,
|
||||
"gpus": [
|
||||
{
|
||||
"id": 0,
|
||||
"input_chunks": 4,
|
||||
"output_chunks": 4,
|
||||
"scratch_chunks": 4,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "copy",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rres",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 1,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "get",
|
||||
"src_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "res",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 1,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"ops": [
|
||||
{
|
||||
"name": "copy",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rres",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 1,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "get",
|
||||
"src_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_ids": [
|
||||
1
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "res",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
},
|
||||
{
|
||||
"buffer_id": 1,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
1,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
1,
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 1,
|
||||
"type": "s",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
},
|
||||
{
|
||||
"rank": 1,
|
||||
"type": "i",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"input_chunks": 4,
|
||||
"output_chunks": 4,
|
||||
"scratch_chunks": 4,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "copy",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rre",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "copy",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "put",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"ops": [
|
||||
{
|
||||
"name": "copy",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 0,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "rre",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
},
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"reduce_op": "sum",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "copy",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "i",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "put",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 2,
|
||||
"size": 2
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "signal",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
},
|
||||
{
|
||||
"name": "wait",
|
||||
"channel_ids": [
|
||||
0
|
||||
],
|
||||
"channel_type": "memory"
|
||||
},
|
||||
{
|
||||
"name": "nop"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
1,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 0,
|
||||
"type": "i",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
}
|
||||
],
|
||||
"num_threads_per_block": 1024,
|
||||
"use_double_scratch_buffer": false,
|
||||
"buffer_alignment": 16,
|
||||
"min_message_size": 0,
|
||||
"max_message_size": 18446744073709551615
|
||||
}
|
||||
216
test/executor-tests/execution-plans/transfer_pack.json
Normal file
216
test/executor-tests/execution-plans/transfer_pack.json
Normal file
@@ -0,0 +1,216 @@
|
||||
{
|
||||
"name": "transfer_pack",
|
||||
"collective": "allgather",
|
||||
"protocol": "LL",
|
||||
"inplace": true,
|
||||
"reuse_resources": false,
|
||||
"gpus": [
|
||||
{
|
||||
"id": 0,
|
||||
"input_chunks": 1,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 2,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "ppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 1,
|
||||
"type": "s",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"input_chunks": 1,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 2,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "cpkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"ops": [
|
||||
{
|
||||
"name": "rppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory"
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 0,
|
||||
"type": "s",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
}
|
||||
],
|
||||
"num_threads_per_block": 1024,
|
||||
"use_double_scratch_buffer": true,
|
||||
"buffer_alignment": 16,
|
||||
"min_message_size": 0,
|
||||
"max_message_size": 18446744073709551615
|
||||
}
|
||||
406
test/executor-tests/execution-plans/transfer_pack_tbg.json
Normal file
406
test/executor-tests/execution-plans/transfer_pack_tbg.json
Normal file
@@ -0,0 +1,406 @@
|
||||
{
|
||||
"name": "transfer_pack_tbg",
|
||||
"collective": "allgather",
|
||||
"protocol": "LL",
|
||||
"inplace": true,
|
||||
"reuse_resources": false,
|
||||
"gpus": [
|
||||
{
|
||||
"id": 0,
|
||||
"input_chunks": 1,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 2,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "ppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"ops": [
|
||||
{
|
||||
"name": "ppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 1,
|
||||
"type": "s",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"input_chunks": 1,
|
||||
"output_chunks": 2,
|
||||
"scratch_chunks": 2,
|
||||
"threadblocks": [
|
||||
{
|
||||
"id": 0,
|
||||
"ops": [
|
||||
{
|
||||
"name": "cpkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"ops": [
|
||||
{
|
||||
"name": "cpkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"ops": [
|
||||
{
|
||||
"name": "rppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"ops": [
|
||||
{
|
||||
"name": "rppkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"buffer_id": 0,
|
||||
"index": 1,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"channel_type": "memory",
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"channel_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffer_refs": [
|
||||
{
|
||||
"access_channel_type": "memory",
|
||||
"remote_buffer_ids": [
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 0,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"ops": [
|
||||
{
|
||||
"name": "upkt",
|
||||
"src_buff": [
|
||||
{
|
||||
"type": "s",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"dst_buff": [
|
||||
{
|
||||
"type": "o",
|
||||
"index": 0,
|
||||
"size": 1
|
||||
}
|
||||
],
|
||||
"tbg_info": {
|
||||
"tb_id": 1,
|
||||
"tbg_size": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"channels": [],
|
||||
"remote_buffer_refs": []
|
||||
}
|
||||
],
|
||||
"channels": [
|
||||
{
|
||||
"channel_type": "memory",
|
||||
"connected_to": [
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"remote_buffers": [
|
||||
{
|
||||
"rank": 0,
|
||||
"type": "s",
|
||||
"access_channel_types": [
|
||||
"memory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"semaphores": []
|
||||
}
|
||||
],
|
||||
"num_threads_per_block": 1024,
|
||||
"use_double_scratch_buffer": true,
|
||||
"buffer_alignment": 16,
|
||||
"min_message_size": 0,
|
||||
"max_message_size": 18446744073709551615
|
||||
}
|
||||
Reference in New Issue
Block a user