Merge branch 'main' into caiorocha/support_tbg_pipeline

Caio Rocha
2026-04-13 21:19:40 +00:00
24 changed files with 3908 additions and 7 deletions

View File

@@ -0,0 +1,42 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: ExecutorTest
displayName: Run executor tests
remoteScript: |
python3 -m pip install .
PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans
TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -148,3 +148,24 @@ jobs:
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942
- job: UnitTestExecutor
timeoutInMinutes: 60
displayName: Test DSL Executor
pool:
name: msccl-ci-h100
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-executor.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'

View File

@@ -389,7 +389,7 @@ struct EndpointConfig {
};
static constexpr int DefaultPort = -1;
-static constexpr int DefaultGidIndex = 0;
+static constexpr int DefaultGidIndex = -1;
static constexpr int DefaultMaxCqSize = 1024;
static constexpr int DefaultMaxCqPollNum = 1;
static constexpr int DefaultMaxSendWr = 8192;
@@ -418,7 +418,7 @@ struct EndpointConfig {
/// Constructor.
/// @param deviceIndex Device index.
/// @param port Port number.
-/// @param gidIndex GID index.
+/// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable.
/// @param maxCqSize Maximum send completion queue size.
/// @param maxCqPollNum Maximum send completion queue poll count.
/// @param maxSendWr Maximum outstanding send work requests.

View File

@@ -115,6 +115,10 @@ class Env {
/// Default is false.
const bool forceDisableGdr;
/// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
/// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified).
const int ibGidIndex;
private:
Env();

View File

@@ -23,7 +23,8 @@ void register_env(nb::module_& m) {
.def_ro("ibv_mode", &Env::ibvMode)
.def_ro("cache_dir", &Env::cacheDir)
.def_ro("npkit_dump_dir", &Env::npkitDumpDir)
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream);
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream)
.def_ro("ib_gid_index", &Env::ibGidIndex);
m.def("env", &env);
}
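
As a quick sanity check, the new read-only property is visible from Python alongside the other env knobs. A minimal sketch, assuming the `mscclpp` package is built from this branch and exposes `env()` at the top level as the binding above suggests:

import mscclpp

# env() returns the singleton Env object registered above; ib_gid_index
# mirrors MSCCLPP_IB_GID_INDEX (default 0) as read at first access.
e = mscclpp.env()
print(e.ib_gid_index)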

View File

@@ -47,6 +47,11 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
}
}
// Resolve GID index: explicit value (>= 0) takes priority, otherwise use env
if (config_.ib.gidIndex < 0) {
config_.ib.gidIndex = env()->ibGidIndex;
}
int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
ibQp_ = contextImpl.getIbContext(config_.transport)
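
The resolution order introduced here: an explicit, non-negative `gidIndex` on the endpoint config wins, and the new default of -1 defers to the environment. A minimal Python sketch of that precedence (a hypothetical `resolve_gid_index` helper for illustration, not part of the PR):

import os

def resolve_gid_index(explicit: int = -1) -> int:
    # A non-negative explicit value takes priority; -1 (the new default)
    # falls back to MSCCLPP_IB_GID_INDEX, which itself defaults to 0.
    if explicit >= 0:
        return explicit
    return int(os.environ.get("MSCCLPP_IB_GID_INDEX", "0"))

assert resolve_gid_index(3) == 3  # explicit value wins
os.environ["MSCCLPP_IB_GID_INDEX"] = "5"
assert resolve_gid_index() == 5   # unspecified defers to the env var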

View File

@@ -66,7 +66,8 @@ Env::Env()
forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
-forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)) {}
+forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)),
+ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", 0)) {}
std::shared_ptr<Env> env() {
static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -95,6 +96,7 @@ std::shared_ptr<Env> env() {
logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
}
return globalEnv;
}

View File

@@ -103,10 +103,10 @@ static int GetGpuClockRateInKhz() {
else
return 25000;
#else
-cudaDeviceProp dev_prop;
+int clockRate;
MSCCLPP_CUDATHROW(cudaGetDevice(&dev_id));
-MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev_id));
-return dev_prop.clockRate;
+MSCCLPP_CUDATHROW(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev_id));
+return clockRate;
#endif
}
#endif
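
Besides dropping the unused `dev_prop`, querying the single attribute is cheaper than `cudaGetDeviceProperties`, which fills the entire property struct; the `clockRate` member is also deprecated in newer CUDA toolkits, so the attribute query sidesteps a deprecation warning there.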

View File

@@ -0,0 +1,87 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce Test
This file tests the PUT, GET, COPY, REDUCE_SEND and READ_REDUCE_SEND
operations. It implements a 2-GPU allreduce using the Simple protocol
with instruction fusion enabled.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def reduce(name, num_threads_per_block, min_message_size, max_message_size):
collective = AllReduce(2, 2, True)
with CollectiveProgram(
name,
collective,
2,
protocol="Simple",
instr_fusion=True,
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=False,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, channels, input and scratch buffers for 2-GPU allreduce
first_rank = Rank(0)
second_rank = Rank(1)
first_ch = MemoryChannel(1, 0)
second_ch = MemoryChannel(0, 1)
first_input_buffer = first_rank.get_input_buffer()
second_input_buffer = second_rank.get_input_buffer()
first_scratch_buffer = Buffer(0, 4)
second_scratch_buffer = Buffer(1, 4)
# Each rank copies its input chunks to scratch to prepare for remote access
first_rank.copy(first_scratch_buffer[2:4], first_input_buffer[2:4], tb=0)
second_rank.copy(second_scratch_buffer[0:2], second_input_buffer[0:2], tb=0)
# Signal and wait to ensure scratch data is visible to the remote rank
first_ch.signal(tb=0)
second_ch.signal(tb=0)
first_ch.wait(tb=0)
second_ch.wait(tb=0)
# Rank 0 reduces chunk 0 from rank 1's scratch and writes result to both ranks
first_ch.reduce(first_input_buffer[0:1], [second_scratch_buffer[0:1]], tb=0)
first_ch.put(second_input_buffer[0:1], first_input_buffer[0:1], tb=0)
# Rank 0 fetches chunk 1 from rank 1's scratch, reduces locally, and writes result to both ranks
first_ch.get(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb=0)
first_rank.reduce(first_input_buffer[1:2], [first_scratch_buffer[1:2]], tb=0)
first_ch.put(second_input_buffer[1:2], first_input_buffer[1:2], tb=0)
# Rank 1 reduces chunks 2-3 from rank 0's input, copies to scratch, and writes result to both ranks
second_ch.reduce(second_input_buffer[2:4], [first_input_buffer[2:4]], tb=0)
second_rank.copy(second_scratch_buffer[2:4], second_input_buffer[2:4], tb=0)
second_ch.put(first_input_buffer[2:4], second_scratch_buffer[2:4], tb=0)
# Final signal/wait to ensure all reduced data is consistent across both ranks
first_ch.signal(tb=0)
second_ch.signal(tb=0)
first_ch.wait(tb=0)
second_ch.wait(tb=0)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
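
Each of the DSL test scripts added in this PR prints its execution plan as JSON on stdout via `print(JSON())`; the checked-in plans under `execution-plans/` are presumably produced by redirecting that output to a file. A minimal sketch of that step (a hypothetical `generate_plan` helper, shown for illustration):

import subprocess
import sys

def generate_plan(script: str, name: str, out_path: str) -> None:
    # Run a DSL script and capture the JSON plan it prints on stdout.
    with open(out_path, "w") as f:
        subprocess.run([sys.executable, script, "--name", name],
                       stdout=f, check=True)

generate_plan("reduce.py", "reduce", "reduce.json")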

View File

@@ -0,0 +1,91 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce NVLS Test
This file tests the executor MULTI_LOAD_REDUCE_STORE operation using
NVLS SwitchChannels. Each GPU reduces its chunk via the
NVSwitch and broadcasts the result to all other GPUs.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def reduce_nvls(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
collective = AllReduce(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
instances=1,
protocol="Simple",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=False,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Create the channels
nvls_chan = SwitchChannel(rank_list=[gpu for gpu in range(gpu_size)], buffer_type=BufferType.input)
channels = {}
for gpu in range(gpu_size):
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)] = MemoryChannel(peer, gpu)
# Synchronize to ensure all GPUs are ready
for gpu in range(gpu_size):
src_rank = gpu
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True)
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True, data_sync=SyncType.after)
# Reduce and store the data
for gpu in range(gpu_size):
buffer_offset = gpu
rank = Rank(gpu)
input_buffer = rank.get_input_buffer()
nvls_chan.at_rank(gpu).reduce(
buffer_offset=buffer_offset, size=1, dst_chunk=input_buffer[gpu : gpu + 1], tb=0
)
nvls_chan.at_rank(gpu).broadcast(
src_chunk=input_buffer[gpu : gpu + 1], buffer_offset=buffer_offset, size=1, tb=0
)
# Synchronize to ensure all GPUs have finished
for gpu in range(gpu_size):
src_rank = gpu
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_gpus", type=int, help="number of gpus")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce_nvls(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
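
Note the pairing in the loop above: `at_rank(gpu).reduce(...)` immediately followed by `at_rank(gpu).broadcast(...)` on the same chunk appears to be what surfaces as the single `glres` (multi-load-reduce-store) switch-channel op in the generated `allreduce_nvls` plan further below.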

View File

@@ -0,0 +1,94 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce NVLS Pipeline Test
This file tests the executor MULTI_LOAD_REDUCE_STORE operation in a
pipeline context using SwitchChannel. Each GPU reduces
its chunk via the NVSwitch and broadcasts the result, processing data
in a pipelined loop over fixed-size iterations.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
from mscclpp.language.loop import LoopIterationContext
def reduce_nvls_pipeline(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
collective = AllReduce(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
instances=1,
protocol="Simple",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=False,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Create the channels
nvls_chan = SwitchChannel(rank_list=[gpu for gpu in range(gpu_size)], buffer_type=BufferType.input)
channels = {}
for gpu in range(gpu_size):
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)] = MemoryChannel(peer, gpu)
# Synchronize to ensure all GPUs are ready
for gpu in range(gpu_size):
src_rank = gpu
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True)
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True, data_sync=SyncType.after)
# Reduce and store the data in a pipelined loop
with LoopIterationContext(unit=2**20, num_chunks=1):
for gpu in range(gpu_size):
buffer_offset = gpu
rank = Rank(gpu)
input_buffer = rank.get_input_buffer()
nvls_chan.at_rank(gpu).reduce(
buffer_offset=buffer_offset, size=1, dst_chunk=input_buffer[gpu : gpu + 1], tb=0
)
nvls_chan.at_rank(gpu).broadcast(
src_chunk=input_buffer[gpu : gpu + 1], buffer_offset=buffer_offset, size=1, tb=0
)
# Synchronize to ensure all GPUs have finished
for gpu in range(gpu_size):
src_rank = gpu
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_gpus", type=int, help="number of gpus")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce_nvls_pipeline(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
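
The `LoopIterationContext(unit=2**20, num_chunks=1)` wrapper is what becomes the `pipeline` op in the generated `allreduce_nvls_pipeline` plan below, carrying `"iter_context": {"unit_size": 1048576, "num_chunks": 1}` (1048576 = 2^20) with the wrapped `glres` nested under its `ops` list.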

View File

@@ -0,0 +1,73 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce Pack Test
This file tests the REDUCE_COPY_SEND_PACKETS and REDUCE_SEND_PACKETS
operations. It implements a 2-GPU allreduce with the LL (low-latency)
packet protocol.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def reduce_pack(name, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
gpu_size = 2
collective = AllReduce(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
protocol="LL",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=True,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, channels, input and scratch buffers for 2-GPU allreduce
first_rank = Rank(0)
second_rank = Rank(1)
first_ch = MemoryChannel(1, 0)
second_ch = MemoryChannel(0, 1)
first_input_buffer = first_rank.get_input_buffer()
second_input_buffer = second_rank.get_input_buffer()
first_scratch_buffer = Buffer(0, 3)
second_scratch_buffer = Buffer(1, 3)
# Each rank sends its input chunk as packets to the other rank's scratch buffer
first_ch.put_packets(second_scratch_buffer[1:2], first_input_buffer[1:2], tb=0)
second_ch.put_packets(first_scratch_buffer[0:1], second_input_buffer[0:1], tb=0)
# Rank 0 reduces received scratch with its input, then sends the result to rank 1's scratch
first_rank.reduce(first_input_buffer[0:1], [first_scratch_buffer[0:1]], tb=1, packet=True)
first_ch.put_packets(second_scratch_buffer[0:1], first_input_buffer[0:1], tb=1)
# Rank 1 reduces received scratch with its input, then sends the result back to rank 0's scratch
second_rank.reduce(second_input_buffer[1:2], [second_scratch_buffer[1:2]], tb=1, packet=True)
second_rank.copy_packets(second_scratch_buffer[2:3], second_input_buffer[1:2], tb=1)
second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[2:3], tb=1)
# Both ranks unpack the final reduced packets from scratch into their output buffers
first_rank.unpack_packets(first_input_buffer[1:2], first_scratch_buffer[1:2], tb=2)
second_rank.unpack_packets(second_input_buffer[0:1], second_scratch_buffer[0:1], tb=2)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce_pack(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
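
Cross-referencing with the generated `reduce_pack` plan below: rank 0's `reduce(..., packet=True)` plus `put_packets` on tb 1 is emitted as a single `respkt` op, while rank 1's reduce plus `copy_packets` plus `read_put_packets` becomes one `recspkt` op, which appears to be how the REDUCE_SEND_PACKETS and REDUCE_COPY_SEND_PACKETS operations named in the docstring are exercised (an inference from matching the script against the plan).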

View File

@@ -0,0 +1,77 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce Pack Thread Block Group Test
This file tests the REDUCE_COPY_SEND_PACKETS and REDUCE_SEND_PACKETS
operations using thread block groups. It implements a 2-GPU allreduce
with the LL (low-latency) packet protocol, where multiple thread
blocks cooperate on each phase.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def reduce_pack_tbg(name, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
gpu_size = 2
collective = AllReduce(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
protocol="LL",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=True,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, channels, input and scratch buffers for 2-GPU allreduce
first_rank = Rank(0)
second_rank = Rank(1)
first_ch = MemoryChannel(1, 0)
second_ch = MemoryChannel(0, 1)
first_input_buffer = first_rank.get_input_buffer()
second_input_buffer = second_rank.get_input_buffer()
first_scratch_buffer = Buffer(0, 3)
second_scratch_buffer = Buffer(1, 3)
tbg = []
for i in range(3):
tbg.append(ThreadBlockGroup(tb_list=[2 * i, 2 * i + 1]))
# Each rank sends its input chunk as packets to the other rank's scratch buffer
first_ch.put_packets(second_scratch_buffer[1:2], first_input_buffer[1:2], tb_group=tbg[0])
second_ch.put_packets(first_scratch_buffer[0:1], second_input_buffer[0:1], tb_group=tbg[0])
# Rank 0 reduces received scratch with its input, then sends the result to rank 1's scratch
first_rank.reduce(first_input_buffer[0:1], [first_scratch_buffer[0:1]], tb_group=tbg[1], packet=True)
first_ch.put_packets(second_scratch_buffer[0:1], first_input_buffer[0:1], tb_group=tbg[1])
# Rank 1 reduces received scratch with its input, then sends the result back to rank 0's scratch
second_rank.reduce(second_input_buffer[1:2], [second_scratch_buffer[1:2]], tb_group=tbg[1], packet=True)
second_rank.copy_packets(second_scratch_buffer[2:3], second_input_buffer[1:2], tb_group=tbg[1])
second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[2:3], tb_group=tbg[1])
# Both ranks unpack the final reduced packets from scratch into their output buffers
first_rank.unpack_packets(first_input_buffer[1:2], first_scratch_buffer[1:2], tb_group=tbg[2])
second_rank.unpack_packets(second_input_buffer[0:1], second_scratch_buffer[0:1], tb_group=tbg[2])
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce_pack_tbg(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
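
Comparing this script with the generated `reduce_pack_tbg` plan below suggests how ThreadBlockGroup ops are lowered: each op assigned to a group is emitted once per member thread block, annotated with a `tbg_info` record. A sketch of that fan-out (assumed behavior inferred from the generated plan, not the actual compiler code):

def expand_tbg_op(op: dict, tb_list: list[int]) -> dict[int, dict]:
    # Emit one copy of the op per member thread block, tagged with its
    # position in the group (tb_id) and the group size (tbg_size).
    return {
        tb: {**op, "tbg_info": {"tb_id": i, "tbg_size": len(tb_list)}}
        for i, tb in enumerate(tb_list)
    }

ops = expand_tbg_op({"name": "upkt"}, tb_list=[4, 5])
assert ops[4]["tbg_info"] == {"tb_id": 0, "tbg_size": 2}
assert ops[5]["tbg_info"] == {"tb_id": 1, "tbg_size": 2}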

View File

@@ -0,0 +1,99 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce Thread Block Group Test
This file tests the PUT, GET, COPY, REDUCE_SEND and READ_REDUCE_SEND
operations using thread block groups. It implements a 2-GPU allreduce
with the Simple protocol and instruction fusion, where multiple thread
blocks cooperate on each operation.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def reduce_tbg(name, num_threads_per_block, min_message_size, max_message_size):
collective = AllReduce(2, 2, True)
with CollectiveProgram(
name,
collective,
2,
protocol="Simple",
instr_fusion=True,
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=False,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, memory channels, input buffers, and scratch buffers for 2-GPU allreduce
first_rank = Rank(0)
second_rank = Rank(1)
first_ch_tb0 = MemoryChannel(1, 0)
first_ch_tb1 = MemoryChannel(1, 0)
second_ch_tb0 = MemoryChannel(0, 1)
second_ch_tb1 = MemoryChannel(0, 1)
first_input_buffer = first_rank.get_input_buffer()
second_input_buffer = second_rank.get_input_buffer()
first_scratch_buffer = Buffer(0, 4)
second_scratch_buffer = Buffer(1, 4)
tbg = ThreadBlockGroup(tb_list=[0, 1])
# Each rank copies its input chunks to scratch to prepare for remote access
first_rank.copy(first_scratch_buffer[2:4], first_input_buffer[2:4], tb_group=tbg)
second_rank.copy(second_scratch_buffer[0:2], second_input_buffer[0:2], tb_group=tbg)
# Signal and wait on both TBs to ensure scratch data is visible to the remote rank
first_ch_tb0.signal(tb=0)
first_ch_tb1.signal(tb=1)
second_ch_tb0.signal(tb=0)
second_ch_tb1.signal(tb=1)
first_ch_tb0.wait(tb=0)
first_ch_tb1.wait(tb=1)
second_ch_tb0.wait(tb=0)
second_ch_tb1.wait(tb=1)
# Rank 0 reduces chunk 0 from rank 1's scratch and writes result to both ranks
first_ch_tb0.reduce(first_input_buffer[0:1], [second_scratch_buffer[0:1]], tb_group=tbg)
first_ch_tb0.put(second_input_buffer[0:1], first_input_buffer[0:1], tb_group=tbg)
# Rank 0 fetches chunk 1 from rank 1's scratch, reduces locally, and writes result to both ranks
first_ch_tb0.get(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb_group=tbg)
first_rank.reduce(first_input_buffer[1:2], [first_scratch_buffer[1:2]], tb_group=tbg)
first_ch_tb0.put(second_input_buffer[1:2], first_input_buffer[1:2], tb_group=tbg)
# Rank 1 reduces chunks 2-3 from rank 0's input, copies to scratch, and writes result to both ranks
second_ch_tb0.reduce(second_input_buffer[2:4], [first_input_buffer[2:4]], tb_group=tbg)
second_rank.copy(second_scratch_buffer[2:4], second_input_buffer[2:4], tb_group=tbg)
second_ch_tb0.put(first_input_buffer[2:4], second_scratch_buffer[2:4], tb_group=tbg)
# Final signal/wait on both TBs to ensure all reduced data is consistent across both ranks
first_ch_tb0.signal(tb=0)
first_ch_tb1.signal(tb=1)
second_ch_tb0.signal(tb=0)
second_ch_tb1.signal(tb=1)
first_ch_tb0.wait(tb=0)
first_ch_tb1.wait(tb=1)
second_ch_tb0.wait(tb=0)
second_ch_tb1.wait(tb=1)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce_tbg(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)

View File

@@ -0,0 +1,67 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Transfer Pack Test
This file tests the UNPACK_PACKETS, COPY_PACKETS, READ_PUT_PACKETS and
PUT_PACKETS operations. It implements a 2-GPU allgather with the LL
(low-latency) packet protocol.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def transfer_pack(name, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
gpu_size = 2
collective = AllGather(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
protocol="LL",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=True,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, channels, output and scratch buffers for 2-GPU allgather
first_rank = Rank(0)
second_rank = Rank(1)
first_ch = MemoryChannel(1, 0)
second_ch = MemoryChannel(0, 1)
first_output_buffer = first_rank.get_output_buffer()
second_output_buffer = second_rank.get_output_buffer()
first_scratch_buffer = Buffer(0, 2)
second_scratch_buffer = Buffer(1, 2)
# Rank 0 sends its output chunk as packets to rank 1's scratch buffer
first_ch.put_packets(second_scratch_buffer[0:1], first_output_buffer[0:1], tb=0)
# Rank 1 copies its output to scratch, then sends it as packets to rank 0's scratch buffer
second_rank.copy_packets(second_scratch_buffer[1:2], second_output_buffer[1:2], tb=0)
second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb=1)
# Both ranks unpack received packets from scratch into their output buffers
first_rank.unpack_packets(first_output_buffer[1:2], first_scratch_buffer[1:2], tb=1)
second_rank.unpack_packets(second_output_buffer[0:1], second_scratch_buffer[0:1], tb=2)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
transfer_pack(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
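
For reference when reading the generated `transfer_pack` plan below, the DSL calls map onto short op names: `put_packets` appears as `ppkt`, `copy_packets` as `cpkt`, `read_put_packets` as `rppkt`, and `unpack_packets` as `upkt` (inferred by matching this script against the plan; the mapping is not spelled out in the diff itself).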

View File

@@ -0,0 +1,71 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Transfer Pack Thread Block Group Test
This file tests the UNPACK_PACKETS, COPY_PACKETS, READ_PUT_PACKETS and
PUT_PACKETS operations using thread block groups. It implements a 2-GPU
allgather with the LL (low-latency) packet protocol, where multiple
thread blocks cooperate on each phase.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def transfer_pack_tbg(name, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
gpu_size = 2
collective = AllGather(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
protocol="LL",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=True,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, channels, output and scratch buffers for 2-GPU allgather
first_rank = Rank(0)
second_rank = Rank(1)
first_ch = MemoryChannel(1, 0)
second_ch = MemoryChannel(0, 1)
first_output_buffer = first_rank.get_output_buffer()
second_output_buffer = second_rank.get_output_buffer()
first_scratch_buffer = Buffer(0, 2)
second_scratch_buffer = Buffer(1, 2)
tbg = []
for i in range(3):
tbg.append(ThreadBlockGroup(tb_list=[2 * i, 2 * i + 1]))
# Rank 0 sends its output chunk as packets to rank 1's scratch buffer
first_ch.put_packets(second_scratch_buffer[0:1], first_output_buffer[0:1], tb_group=tbg[0])
# Rank 1 copies its output to scratch, then sends it as packets to rank 0's scratch buffer
second_rank.copy_packets(second_scratch_buffer[1:2], second_output_buffer[1:2], tb_group=tbg[0])
second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb_group=tbg[1])
# Both ranks unpack received packets from scratch into their output buffers
first_rank.unpack_packets(first_output_buffer[1:2], first_scratch_buffer[1:2], tb_group=tbg[1])
second_rank.unpack_packets(second_output_buffer[0:1], second_scratch_buffer[0:1], tb_group=tbg[2])
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
transfer_pack_tbg(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)

View File

@@ -0,0 +1,389 @@
{
"name": "reduce",
"collective": "allreduce",
"protocol": "Simple",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 4,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
]
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rres",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 1,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
},
{
"name": "get",
"src_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "res",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"buffer_id": 1,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0,
1
]
}
]
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
},
{
"rank": 1,
"type": "i",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 4,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 0,
"size": 2
}
]
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rre",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
},
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"channel_type": "memory",
"reduce_op": "sum"
},
{
"name": "nop"
},
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
]
},
{
"name": "nop"
},
{
"name": "put",
"src_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "i",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": false,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}
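
The short op names in this plan line up with the operations named in the `reduce.py` docstring: `rres` appears to correspond to READ_REDUCE_SEND, `res` to REDUCE_SEND, and `rre` to a read-reduce without the send, alongside the literal `put`, `get`, `copy`, `signal`, and `wait` ops (again an inference from matching the script to the plan).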

View File

@@ -0,0 +1,246 @@
{
"name": "allreduce_nvls",
"collective": "allreduce",
"protocol": "Simple",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 0,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "glres",
"src_buff": [
{
"switch_channel_id": 0,
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"switch_channel_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "switch",
"reduce_op": "sum"
},
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
},
{
"channel_type": "switch",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
},
{
"channel_type": "switch",
"buffer_type": "i",
"rank_groups": [
{
"size": 2,
"ranks": [
0,
1
]
}
]
}
],
"remote_buffers": [],
"semaphores": []
},
{
"id": 1,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 0,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "glres",
"src_buff": [
{
"switch_channel_id": 0,
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"switch_channel_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "switch",
"reduce_op": "sum"
},
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
},
{
"channel_type": "switch",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
},
{
"channel_type": "switch",
"buffer_type": "i",
"rank_groups": [
{
"size": 2,
"ranks": [
0,
1
]
}
]
}
],
"remote_buffers": [],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": false,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,264 @@
{
"name": "allreduce_nvls_pipeline",
"collective": "allreduce",
"protocol": "Simple",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 0,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "pipeline",
"iter_context": {
"unit_size": 1048576,
"num_chunks": 1
},
"ops": [
{
"name": "glres",
"src_buff": [
{
"switch_channel_id": 0,
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"switch_channel_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "switch",
"reduce_op": "sum"
}
]
},
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
},
{
"channel_type": "switch",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
},
{
"channel_type": "switch",
"buffer_type": "i",
"rank_groups": [
{
"size": 2,
"ranks": [
0,
1
]
}
]
}
],
"remote_buffers": [],
"semaphores": []
},
{
"id": 1,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 0,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "pipeline",
"iter_context": {
"unit_size": 1048576,
"num_chunks": 1
},
"ops": [
{
"name": "glres",
"src_buff": [
{
"switch_channel_id": 0,
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"switch_channel_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "switch",
"reduce_op": "sum"
}
]
},
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
},
{
"channel_type": "switch",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
},
{
"channel_type": "switch",
"buffer_type": "i",
"rank_groups": [
{
"size": 2,
"ranks": [
0,
1
]
}
]
}
],
"remote_buffers": [],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": false,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,297 @@
{
"name": "reduce_pack",
"collective": "allreduce",
"protocol": "LL",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 3,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "respkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
]
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 3,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "recspkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 2,
"size": 1
},
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
]
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": true,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,576 @@
{
"name": "reduce_pack_tbg",
"collective": "allreduce",
"protocol": "LL",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 3,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "respkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 3,
"ops": [
{
"name": "respkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 4,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 5,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 3,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "recspkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 2,
"size": 1
},
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 3,
"ops": [
{
"name": "recspkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 2,
"size": 1
},
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 4,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 5,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": true,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,773 @@
{
"name": "reduce_tbg",
"collective": "allreduce",
"protocol": "Simple",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 4,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rres",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 1,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "get",
"src_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"channel_ids": [
0
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "res",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"buffer_id": 1,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0,
1
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rres",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 1,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "get",
"src_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"channel_ids": [
1
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "res",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"buffer_id": 1,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
1,
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0,
1
]
}
]
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1,
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
},
{
"rank": 1,
"type": "i",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 4,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 0,
"size": 2
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rre",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
},
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "put",
"src_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 0,
"size": 2
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rre",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
},
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "put",
"src_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
1,
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0,
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "i",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": false,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,216 @@
{
"name": "transfer_pack",
"collective": "allgather",
"protocol": "LL",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 1,
"output_chunks": 2,
"scratch_chunks": 2,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
]
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 1,
"output_chunks": 2,
"scratch_chunks": 2,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "cpkt",
"src_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
]
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 1,
"ops": [
{
"name": "rppkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
]
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": true,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,406 @@
{
"name": "transfer_pack_tbg",
"collective": "allgather",
"protocol": "LL",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 1,
"output_chunks": 2,
"scratch_chunks": 2,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 3,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 1,
"output_chunks": 2,
"scratch_chunks": 2,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "cpkt",
"src_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 1,
"ops": [
{
"name": "cpkt",
"src_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 2,
"ops": [
{
"name": "rppkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 3,
"ops": [
{
"name": "rppkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 4,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 5,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": true,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}