Merge branch 'main' into caiorocha/support_tbg_pipeline

Caio Rocha
2026-04-13 21:19:40 +00:00
24 changed files with 3908 additions and 7 deletions

View File

@@ -0,0 +1,42 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: ExecutorTest
displayName: Run executor tests
remoteScript: |
python3 -m pip install .
PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans
TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -148,3 +148,24 @@ jobs:
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942
- job: UnitTestExecutor
timeoutInMinutes: 60
displayName: Test DSL Executor
pool:
name: msccl-ci-h100
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-executor.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'

View File

@@ -389,7 +389,7 @@ struct EndpointConfig {
};
static constexpr int DefaultPort = -1;
-static constexpr int DefaultGidIndex = 0;
+static constexpr int DefaultGidIndex = -1;
static constexpr int DefaultMaxCqSize = 1024;
static constexpr int DefaultMaxCqPollNum = 1;
static constexpr int DefaultMaxSendWr = 8192;
@@ -418,7 +418,7 @@ struct EndpointConfig {
/// Constructor.
/// @param deviceIndex Device index.
/// @param port Port number.
-/// @param gidIndex GID index.
+/// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable.
/// @param maxCqSize Maximum send completion queue size.
/// @param maxCqPollNum Maximum send completion queue poll count.
/// @param maxSendWr Maximum outstanding send work requests.

View File

@@ -115,6 +115,10 @@ class Env {
/// Default is false.
const bool forceDisableGdr;
/// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
/// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified).
const int ibGidIndex;
private:
Env();

View File

@@ -23,7 +23,8 @@ void register_env(nb::module_& m) {
.def_ro("ibv_mode", &Env::ibvMode)
.def_ro("cache_dir", &Env::cacheDir)
.def_ro("npkit_dump_dir", &Env::npkitDumpDir)
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream);
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream)
.def_ro("ib_gid_index", &Env::ibGidIndex);
m.def("env", &env);
}
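
As a quick sanity check, the new read-only property is visible from Python alongside the other env knobs. A minimal sketch, assuming the `mscclpp` package is built from this branch and exposes `env()` at the top level as the binding above suggests:

import mscclpp

# env() returns the singleton Env object registered above; ib_gid_index
# mirrors MSCCLPP_IB_GID_INDEX (default 0) as read at first access.
e = mscclpp.env()
print(e.ib_gid_index)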

View File

@@ -47,6 +47,11 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
}
}
// Resolve GID index: explicit value (>= 0) takes priority, otherwise use env
if (config_.ib.gidIndex < 0) {
config_.ib.gidIndex = env()->ibGidIndex;
}
int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
ibQp_ = contextImpl.getIbContext(config_.transport)
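
The resolution order introduced here: an explicit, non-negative `gidIndex` on the endpoint config wins, and the new default of -1 defers to the environment. A minimal Python sketch of that precedence (a hypothetical `resolve_gid_index` helper for illustration, not part of the PR):

import os

def resolve_gid_index(explicit: int = -1) -> int:
    # A non-negative explicit value takes priority; -1 (the new default)
    # falls back to MSCCLPP_IB_GID_INDEX, which itself defaults to 0.
    if explicit >= 0:
        return explicit
    return int(os.environ.get("MSCCLPP_IB_GID_INDEX", "0"))

assert resolve_gid_index(3) == 3  # explicit value wins
os.environ["MSCCLPP_IB_GID_INDEX"] = "5"
assert resolve_gid_index() == 5   # unspecified defers to the env var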

View File

@@ -66,7 +66,8 @@ Env::Env()
forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
-forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)) {}
+forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)),
+ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", 0)) {}
std::shared_ptr<Env> env() {
static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -95,6 +96,7 @@ std::shared_ptr<Env> env() {
logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
}
return globalEnv;
}

View File

@@ -103,10 +103,10 @@ static int GetGpuClockRateInKhz() {
else
return 25000;
#else
-cudaDeviceProp dev_prop;
+int clockRate;
MSCCLPP_CUDATHROW(cudaGetDevice(&dev_id));
-MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev_id));
-return dev_prop.clockRate;
+MSCCLPP_CUDATHROW(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev_id));
+return clockRate;
#endif
}
#endif
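
Besides dropping the unused `dev_prop`, querying the single attribute is cheaper than `cudaGetDeviceProperties`, which fills the entire property struct; the `clockRate` member is also deprecated in newer CUDA toolkits, so the attribute query sidesteps a deprecation warning there.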

View File

@@ -0,0 +1,87 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce Test
This file tests the PUT, GET, COPY, REDUCE_SEND and READ_REDUCE_SEND
operations. It implements a 2-GPU allreduce using the Simple protocol
with instruction fusion enabled.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def reduce(name, num_threads_per_block, min_message_size, max_message_size):
collective = AllReduce(2, 2, True)
with CollectiveProgram(
name,
collective,
2,
protocol="Simple",
instr_fusion=True,
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=False,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, channels, input and scratch buffers for 2-GPU allreduce
first_rank = Rank(0)
second_rank = Rank(1)
first_ch = MemoryChannel(1, 0)
second_ch = MemoryChannel(0, 1)
first_input_buffer = first_rank.get_input_buffer()
second_input_buffer = second_rank.get_input_buffer()
first_scratch_buffer = Buffer(0, 4)
second_scratch_buffer = Buffer(1, 4)
# Each rank copies its input chunks to scratch to prepare for remote access
first_rank.copy(first_scratch_buffer[2:4], first_input_buffer[2:4], tb=0)
second_rank.copy(second_scratch_buffer[0:2], second_input_buffer[0:2], tb=0)
# Signal and wait to ensure scratch data is visible to the remote rank
first_ch.signal(tb=0)
second_ch.signal(tb=0)
first_ch.wait(tb=0)
second_ch.wait(tb=0)
# Rank 0 reduces chunk 0 from rank 1's scratch and writes result to both ranks
first_ch.reduce(first_input_buffer[0:1], [second_scratch_buffer[0:1]], tb=0)
first_ch.put(second_input_buffer[0:1], first_input_buffer[0:1], tb=0)
# Rank 0 fetches chunk 1 from rank 1's scratch, reduces locally, and writes result to both ranks
first_ch.get(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb=0)
first_rank.reduce(first_input_buffer[1:2], [first_scratch_buffer[1:2]], tb=0)
first_ch.put(second_input_buffer[1:2], first_input_buffer[1:2], tb=0)
# Rank 1 reduces chunks 2-3 from rank 0's input, copies to scratch, and writes result to both ranks
second_ch.reduce(second_input_buffer[2:4], [first_input_buffer[2:4]], tb=0)
second_rank.copy(second_scratch_buffer[2:4], second_input_buffer[2:4], tb=0)
second_ch.put(first_input_buffer[2:4], second_scratch_buffer[2:4], tb=0)
# Final signal/wait to ensure all reduced data is consistent across both ranks
first_ch.signal(tb=0)
second_ch.signal(tb=0)
first_ch.wait(tb=0)
second_ch.wait(tb=0)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
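
Each of the DSL test scripts added in this PR prints its execution plan as JSON on stdout via `print(JSON())`; the checked-in plans under `execution-plans/` are presumably produced by redirecting that output to a file. A minimal sketch of that step (a hypothetical `generate_plan` helper, shown for illustration):

import subprocess
import sys

def generate_plan(script: str, name: str, out_path: str) -> None:
    # Run a DSL script and capture the JSON plan it prints on stdout.
    with open(out_path, "w") as f:
        subprocess.run([sys.executable, script, "--name", name],
                       stdout=f, check=True)

generate_plan("reduce.py", "reduce", "reduce.json")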

View File

@@ -0,0 +1,91 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce NVLS Test
This file tests the executor MULTI_LOAD_REDUCE_STORE operation using
NVLS SwitchChannels. Each GPU reduces its chunk via the
NVSwitch and broadcasts the result to all other GPUs.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def reduce_nvls(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
collective = AllReduce(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
instances=1,
protocol="Simple",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=False,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Create the channels
nvls_chan = SwitchChannel(rank_list=[gpu for gpu in range(gpu_size)], buffer_type=BufferType.input)
channels = {}
for gpu in range(gpu_size):
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)] = MemoryChannel(peer, gpu)
# Synchronize to ensure all GPUs are ready
for gpu in range(gpu_size):
src_rank = gpu
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True)
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True, data_sync=SyncType.after)
# Reduce and store the data
for gpu in range(gpu_size):
buffer_offset = gpu
rank = Rank(gpu)
input_buffer = rank.get_input_buffer()
nvls_chan.at_rank(gpu).reduce(
buffer_offset=buffer_offset, size=1, dst_chunk=input_buffer[gpu : gpu + 1], tb=0
)
nvls_chan.at_rank(gpu).broadcast(
src_chunk=input_buffer[gpu : gpu + 1], buffer_offset=buffer_offset, size=1, tb=0
)
# Synchronize to ensure all GPUs have finished
for gpu in range(gpu_size):
src_rank = gpu
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_gpus", type=int, help="number of gpus")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce_nvls(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
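
Note the pairing in the loop above: `at_rank(gpu).reduce(...)` immediately followed by `at_rank(gpu).broadcast(...)` on the same chunk appears to be what surfaces as the single `glres` (multi-load-reduce-store) switch-channel op in the generated `allreduce_nvls` plan further below.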

View File

@@ -0,0 +1,94 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce NVLS Pipeline Test
This file tests the executor MULTI_LOAD_REDUCE_STORE operation in a
pipeline context using SwitchChannel. Each GPU reduces
its chunk via the NVSwitch and broadcasts the result, processing data
in a pipelined loop over fixed-size iterations.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
from mscclpp.language.loop import LoopIterationContext
def reduce_nvls_pipeline(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
collective = AllReduce(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
instances=1,
protocol="Simple",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=False,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Create the channels
nvls_chan = SwitchChannel(rank_list=[gpu for gpu in range(gpu_size)], buffer_type=BufferType.input)
channels = {}
for gpu in range(gpu_size):
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)] = MemoryChannel(peer, gpu)
# Synchronize to ensure all GPUs are ready
for gpu in range(gpu_size):
src_rank = gpu
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True)
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True, data_sync=SyncType.after)
# Reduce and store the data in a pipelined loop
with LoopIterationContext(unit=2**20, num_chunks=1):
for gpu in range(gpu_size):
buffer_offset = gpu
rank = Rank(gpu)
input_buffer = rank.get_input_buffer()
nvls_chan.at_rank(gpu).reduce(
buffer_offset=buffer_offset, size=1, dst_chunk=input_buffer[gpu : gpu + 1], tb=0
)
nvls_chan.at_rank(gpu).broadcast(
src_chunk=input_buffer[gpu : gpu + 1], buffer_offset=buffer_offset, size=1, tb=0
)
# Synchronize to ensure all GPUs have finished
for gpu in range(gpu_size):
src_rank = gpu
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
for peer in range(gpu_size):
if peer != src_rank:
dst_rank = peer
channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_gpus", type=int, help="number of gpus")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce_nvls_pipeline(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
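
The `LoopIterationContext(unit=2**20, num_chunks=1)` wrapper is what becomes the `pipeline` op in the generated `allreduce_nvls_pipeline` plan below, carrying `"iter_context": {"unit_size": 1048576, "num_chunks": 1}` (1048576 = 2^20) with the wrapped `glres` nested under its `ops` list.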

View File

@@ -0,0 +1,73 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce Pack Test
This file tests the REDUCE_COPY_SEND_PACKETS and REDUCE_SEND_PACKETS
operations. It implements a 2-GPU allreduce with the LL (low-latency)
packet protocol.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def reduce_pack(name, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
gpu_size = 2
collective = AllReduce(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
protocol="LL",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=True,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, channels, input and scratch buffers for 2-GPU allreduce
first_rank = Rank(0)
second_rank = Rank(1)
first_ch = MemoryChannel(1, 0)
second_ch = MemoryChannel(0, 1)
first_input_buffer = first_rank.get_input_buffer()
second_input_buffer = second_rank.get_input_buffer()
first_scratch_buffer = Buffer(0, 3)
second_scratch_buffer = Buffer(1, 3)
# Each rank sends its input chunk as packets to the other rank's scratch buffer
first_ch.put_packets(second_scratch_buffer[1:2], first_input_buffer[1:2], tb=0)
second_ch.put_packets(first_scratch_buffer[0:1], second_input_buffer[0:1], tb=0)
# Rank 0 reduces received scratch with its input, then sends the result to rank 1's scratch
first_rank.reduce(first_input_buffer[0:1], [first_scratch_buffer[0:1]], tb=1, packet=True)
first_ch.put_packets(second_scratch_buffer[0:1], first_input_buffer[0:1], tb=1)
# Rank 1 reduces received scratch with its input, then sends the result back to rank 0's scratch
second_rank.reduce(second_input_buffer[1:2], [second_scratch_buffer[1:2]], tb=1, packet=True)
second_rank.copy_packets(second_scratch_buffer[2:3], second_input_buffer[1:2], tb=1)
second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[2:3], tb=1)
# Both ranks unpack the final reduced packets from scratch into their output buffers
first_rank.unpack_packets(first_input_buffer[1:2], first_scratch_buffer[1:2], tb=2)
second_rank.unpack_packets(second_input_buffer[0:1], second_scratch_buffer[0:1], tb=2)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce_pack(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
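
Cross-referencing with the generated `reduce_pack` plan below: rank 0's `reduce(..., packet=True)` plus `put_packets` on tb 1 is emitted as a single `respkt` op, while rank 1's reduce plus `copy_packets` plus `read_put_packets` becomes one `recspkt` op, which appears to be how the REDUCE_SEND_PACKETS and REDUCE_COPY_SEND_PACKETS operations named in the docstring are exercised (an inference from matching the script against the plan).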

View File

@@ -0,0 +1,77 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce Pack Thread Block Group Test
This file tests the REDUCE_COPY_SEND_PACKETS and REDUCE_SEND_PACKETS
operations using thread block groups. It implements a 2-GPU allreduce
with the LL (low-latency) packet protocol, where multiple thread
blocks cooperate on each phase.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def reduce_pack_tbg(name, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
gpu_size = 2
collective = AllReduce(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
protocol="LL",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=True,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, channels, input and scratch buffers for 2-GPU allreduce
first_rank = Rank(0)
second_rank = Rank(1)
first_ch = MemoryChannel(1, 0)
second_ch = MemoryChannel(0, 1)
first_input_buffer = first_rank.get_input_buffer()
second_input_buffer = second_rank.get_input_buffer()
first_scratch_buffer = Buffer(0, 3)
second_scratch_buffer = Buffer(1, 3)
tbg = []
for i in range(3):
tbg.append(ThreadBlockGroup(tb_list=[2 * i, 2 * i + 1]))
# Each rank sends its input chunk as packets to the other rank's scratch buffer
first_ch.put_packets(second_scratch_buffer[1:2], first_input_buffer[1:2], tb_group=tbg[0])
second_ch.put_packets(first_scratch_buffer[0:1], second_input_buffer[0:1], tb_group=tbg[0])
# Rank 0 reduces received scratch with its input, then sends the result to rank 1's scratch
first_rank.reduce(first_input_buffer[0:1], [first_scratch_buffer[0:1]], tb_group=tbg[1], packet=True)
first_ch.put_packets(second_scratch_buffer[0:1], first_input_buffer[0:1], tb_group=tbg[1])
# Rank 1 reduces received scratch with its input, then sends the result back to rank 0's scratch
second_rank.reduce(second_input_buffer[1:2], [second_scratch_buffer[1:2]], tb_group=tbg[1], packet=True)
second_rank.copy_packets(second_scratch_buffer[2:3], second_input_buffer[1:2], tb_group=tbg[1])
second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[2:3], tb_group=tbg[1])
# Both ranks unpack the final reduced packets from scratch into their output buffers
first_rank.unpack_packets(first_input_buffer[1:2], first_scratch_buffer[1:2], tb_group=tbg[2])
second_rank.unpack_packets(second_input_buffer[0:1], second_scratch_buffer[0:1], tb_group=tbg[2])
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce_pack_tbg(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
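
Comparing this script with the generated `reduce_pack_tbg` plan below suggests how ThreadBlockGroup ops are lowered: each op assigned to a group is emitted once per member thread block, annotated with a `tbg_info` record. A sketch of that fan-out (assumed behavior inferred from the generated plan, not the actual compiler code):

def expand_tbg_op(op: dict, tb_list: list[int]) -> dict[int, dict]:
    # Emit one copy of the op per member thread block, tagged with its
    # position in the group (tb_id) and the group size (tbg_size).
    return {
        tb: {**op, "tbg_info": {"tb_id": i, "tbg_size": len(tb_list)}}
        for i, tb in enumerate(tb_list)
    }

ops = expand_tbg_op({"name": "upkt"}, tb_list=[4, 5])
assert ops[4]["tbg_info"] == {"tb_id": 0, "tbg_size": 2}
assert ops[5]["tbg_info"] == {"tb_id": 1, "tbg_size": 2}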

View File

@@ -0,0 +1,99 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Reduce Thread Block Group Test
This file tests the PUT, GET, COPY, REDUCE_SEND and READ_REDUCE_SEND
operations using thread block groups. It implements a 2-GPU allreduce
with the Simple protocol and instruction fusion, where multiple thread
blocks cooperate on each operation.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def reduce_tbg(name, num_threads_per_block, min_message_size, max_message_size):
collective = AllReduce(2, 2, True)
with CollectiveProgram(
name,
collective,
2,
protocol="Simple",
instr_fusion=True,
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=False,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, memory channels, input buffers, and scratch buffers for 2-GPU allreduce
first_rank = Rank(0)
second_rank = Rank(1)
first_ch_tb0 = MemoryChannel(1, 0)
first_ch_tb1 = MemoryChannel(1, 0)
second_ch_tb0 = MemoryChannel(0, 1)
second_ch_tb1 = MemoryChannel(0, 1)
first_input_buffer = first_rank.get_input_buffer()
second_input_buffer = second_rank.get_input_buffer()
first_scratch_buffer = Buffer(0, 4)
second_scratch_buffer = Buffer(1, 4)
tbg = ThreadBlockGroup(tb_list=[0, 1])
# Each rank copies its input chunks to scratch to prepare for remote access
first_rank.copy(first_scratch_buffer[2:4], first_input_buffer[2:4], tb_group=tbg)
second_rank.copy(second_scratch_buffer[0:2], second_input_buffer[0:2], tb_group=tbg)
# Signal and wait on both TBs to ensure scratch data is visible to the remote rank
first_ch_tb0.signal(tb=0)
first_ch_tb1.signal(tb=1)
second_ch_tb0.signal(tb=0)
second_ch_tb1.signal(tb=1)
first_ch_tb0.wait(tb=0)
first_ch_tb1.wait(tb=1)
second_ch_tb0.wait(tb=0)
second_ch_tb1.wait(tb=1)
# Rank 0 reduces chunk 0 from rank 1's scratch and writes result to both ranks
first_ch_tb0.reduce(first_input_buffer[0:1], [second_scratch_buffer[0:1]], tb_group=tbg)
first_ch_tb0.put(second_input_buffer[0:1], first_input_buffer[0:1], tb_group=tbg)
# Rank 0 fetches chunk 1 from rank 1's scratch, reduces locally, and writes result to both ranks
first_ch_tb0.get(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb_group=tbg)
first_rank.reduce(first_input_buffer[1:2], [first_scratch_buffer[1:2]], tb_group=tbg)
first_ch_tb0.put(second_input_buffer[1:2], first_input_buffer[1:2], tb_group=tbg)
# Rank 1 reduces chunks 2-3 from rank 0's input, copies to scratch, and writes result to both ranks
second_ch_tb0.reduce(second_input_buffer[2:4], [first_input_buffer[2:4]], tb_group=tbg)
second_rank.copy(second_scratch_buffer[2:4], second_input_buffer[2:4], tb_group=tbg)
second_ch_tb0.put(first_input_buffer[2:4], second_scratch_buffer[2:4], tb_group=tbg)
# Final signal/wait on both TBs to ensure all reduced data is consistent across both ranks
first_ch_tb0.signal(tb=0)
first_ch_tb1.signal(tb=1)
second_ch_tb0.signal(tb=0)
second_ch_tb1.signal(tb=1)
first_ch_tb0.wait(tb=0)
first_ch_tb1.wait(tb=1)
second_ch_tb0.wait(tb=0)
second_ch_tb1.wait(tb=1)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
reduce_tbg(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)

View File

@@ -0,0 +1,67 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Transfer Pack Test
This file tests the UNPACK_PACKETS, COPY_PACKETS, READ_PUT_PACKETS and
PUT_PACKETS operations. It implements a 2-GPU allgather with the LL
(low-latency) packet protocol.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def transfer_pack(name, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
gpu_size = 2
collective = AllGather(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
protocol="LL",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=True,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, channels, output and scratch buffers for 2-GPU allgather
first_rank = Rank(0)
second_rank = Rank(1)
first_ch = MemoryChannel(1, 0)
second_ch = MemoryChannel(0, 1)
first_output_buffer = first_rank.get_output_buffer()
second_output_buffer = second_rank.get_output_buffer()
first_scratch_buffer = Buffer(0, 2)
second_scratch_buffer = Buffer(1, 2)
# Rank 0 sends its output chunk as packets to rank 1's scratch buffer
first_ch.put_packets(second_scratch_buffer[0:1], first_output_buffer[0:1], tb=0)
# Rank 1 copies its output to scratch, then sends it as packets to rank 0's scratch buffer
second_rank.copy_packets(second_scratch_buffer[1:2], second_output_buffer[1:2], tb=0)
second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb=1)
# Both ranks unpack received packets from scratch into their output buffers
first_rank.unpack_packets(first_output_buffer[1:2], first_scratch_buffer[1:2], tb=1)
second_rank.unpack_packets(second_output_buffer[0:1], second_scratch_buffer[0:1], tb=2)
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
transfer_pack(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
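
For reference when reading the generated `transfer_pack` plan below, the DSL calls map onto short op names: `put_packets` appears as `ppkt`, `copy_packets` as `cpkt`, `read_put_packets` as `rppkt`, and `unpack_packets` as `upkt` (inferred by matching this script against the plan; the mapping is not spelled out in the diff itself).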

View File

@@ -0,0 +1,71 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Transfer Pack Thread Block Group Test
This file tests the UNPACK_PACKETS, COPY_PACKETS, READ_PUT_PACKETS and
PUT_PACKETS operations using thread block groups. It implements a 2-GPU
allgather with the LL (low-latency) packet protocol, where multiple
thread blocks cooperate on each phase.
"""
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def transfer_pack_tbg(name, num_threads_per_block, min_message_size, max_message_size):
chunksperloop = 1
gpu_size = 2
collective = AllGather(gpu_size, chunksperloop, True)
with CollectiveProgram(
name,
collective,
gpu_size,
protocol="LL",
num_threads_per_block=num_threads_per_block,
use_double_scratch_buffer=True,
min_message_size=min_message_size,
max_message_size=max_message_size,
):
# Set up ranks, channels, output and scratch buffers for 2-GPU allgather
first_rank = Rank(0)
second_rank = Rank(1)
first_ch = MemoryChannel(1, 0)
second_ch = MemoryChannel(0, 1)
first_output_buffer = first_rank.get_output_buffer()
second_output_buffer = second_rank.get_output_buffer()
first_scratch_buffer = Buffer(0, 2)
second_scratch_buffer = Buffer(1, 2)
tbg = []
for i in range(3):
tbg.append(ThreadBlockGroup(tb_list=[2 * i, 2 * i + 1]))
# Rank 0 sends its output chunk as packets to rank 1's scratch buffer
first_ch.put_packets(second_scratch_buffer[0:1], first_output_buffer[0:1], tb_group=tbg[0])
# Rank 1 copies its output to scratch, then sends it as packets to rank 0's scratch buffer
second_rank.copy_packets(second_scratch_buffer[1:2], second_output_buffer[1:2], tb_group=tbg[0])
second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb_group=tbg[1])
# Both ranks unpack received packets from scratch into their output buffers
first_rank.unpack_packets(first_output_buffer[1:2], first_scratch_buffer[1:2], tb_group=tbg[1])
second_rank.unpack_packets(second_output_buffer[0:1], second_scratch_buffer[0:1], tb_group=tbg[2])
print(JSON())
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
args = parser.parse_args()
transfer_pack_tbg(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)

View File

@@ -0,0 +1,389 @@
{
"name": "reduce",
"collective": "allreduce",
"protocol": "Simple",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 4,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
]
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rres",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 1,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
},
{
"name": "get",
"src_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "res",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"buffer_id": 1,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0,
1
]
}
]
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
},
{
"rank": 1,
"type": "i",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 4,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 0,
"size": 2
}
]
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rre",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
},
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"channel_type": "memory",
"reduce_op": "sum"
},
{
"name": "nop"
},
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
]
},
{
"name": "nop"
},
{
"name": "put",
"src_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "i",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": false,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}
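
The short op names in this plan line up with the operations named in the `reduce.py` docstring: `rres` appears to correspond to READ_REDUCE_SEND, `res` to REDUCE_SEND, and `rre` to a read-reduce without the send, alongside the literal `put`, `get`, `copy`, `signal`, and `wait` ops (again an inference from matching the script to the plan).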

View File

@@ -0,0 +1,246 @@
{
"name": "allreduce_nvls",
"collective": "allreduce",
"protocol": "Simple",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 0,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "glres",
"src_buff": [
{
"switch_channel_id": 0,
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"switch_channel_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "switch",
"reduce_op": "sum"
},
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
},
{
"channel_type": "switch",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
},
{
"channel_type": "switch",
"buffer_type": "i",
"rank_groups": [
{
"size": 2,
"ranks": [
0,
1
]
}
]
}
],
"remote_buffers": [],
"semaphores": []
},
{
"id": 1,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 0,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "glres",
"src_buff": [
{
"switch_channel_id": 0,
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"switch_channel_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "switch",
"reduce_op": "sum"
},
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
},
{
"channel_type": "switch",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
},
{
"channel_type": "switch",
"buffer_type": "i",
"rank_groups": [
{
"size": 2,
"ranks": [
0,
1
]
}
]
}
],
"remote_buffers": [],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": false,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,264 @@
{
"name": "allreduce_nvls_pipeline",
"collective": "allreduce",
"protocol": "Simple",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 0,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "pipeline",
"iter_context": {
"unit_size": 1048576,
"num_chunks": 1
},
"ops": [
{
"name": "glres",
"src_buff": [
{
"switch_channel_id": 0,
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"switch_channel_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "switch",
"reduce_op": "sum"
}
]
},
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
},
{
"channel_type": "switch",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
},
{
"channel_type": "switch",
"buffer_type": "i",
"rank_groups": [
{
"size": 2,
"ranks": [
0,
1
]
}
]
}
],
"remote_buffers": [],
"semaphores": []
},
{
"id": 1,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 0,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "pipeline",
"iter_context": {
"unit_size": 1048576,
"num_chunks": 1
},
"ops": [
{
"name": "glres",
"src_buff": [
{
"switch_channel_id": 0,
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"switch_channel_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "switch",
"reduce_op": "sum"
}
]
},
{
"name": "nop"
},
{
"name": "rlxsignal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rlxwait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
},
{
"channel_type": "switch",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
},
{
"channel_type": "switch",
"buffer_type": "i",
"rank_groups": [
{
"size": 2,
"ranks": [
0,
1
]
}
]
}
],
"remote_buffers": [],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": false,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,297 @@
{
"name": "reduce_pack",
"collective": "allreduce",
"protocol": "LL",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 3,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "respkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
]
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 3,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "recspkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 2,
"size": 1
},
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
]
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": true,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,576 @@
{
"name": "reduce_pack_tbg",
"collective": "allreduce",
"protocol": "LL",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 3,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "respkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 3,
"ops": [
{
"name": "respkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 4,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 5,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 2,
"output_chunks": 2,
"scratch_chunks": 3,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "recspkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 2,
"size": 1
},
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 3,
"ops": [
{
"name": "recspkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 2,
"size": 1
},
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 4,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 5,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": true,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,773 @@
{
"name": "reduce_tbg",
"collective": "allreduce",
"protocol": "Simple",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 4,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rres",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 1,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "get",
"src_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"channel_ids": [
0
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "res",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"buffer_id": 1,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0,
1
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rres",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 1,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "get",
"src_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"channel_ids": [
1
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "res",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"buffer_id": 1,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
1,
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0,
1
]
}
]
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1,
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
},
{
"rank": 1,
"type": "i",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 4,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 0,
"size": 2
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rre",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
},
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "put",
"src_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 0,
"size": 2
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "rre",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
},
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"channel_type": "memory",
"reduce_op": "sum",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "copy",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "put",
"src_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 2,
"size": 2
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
},
{
"name": "nop"
},
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "memory"
},
{
"name": "nop"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
1,
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0,
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "i",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": false,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,216 @@
{
"name": "transfer_pack",
"collective": "allgather",
"protocol": "LL",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 1,
"output_chunks": 2,
"scratch_chunks": 2,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
]
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 1,
"output_chunks": 2,
"scratch_chunks": 2,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "cpkt",
"src_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
]
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 1,
"ops": [
{
"name": "rppkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
]
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": true,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}

View File

@@ -0,0 +1,406 @@
{
"name": "transfer_pack_tbg",
"collective": "allgather",
"protocol": "LL",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 1,
"output_chunks": 2,
"scratch_chunks": 2,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 3,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 1,
"output_chunks": 2,
"scratch_chunks": 2,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "cpkt",
"src_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 1,
"ops": [
{
"name": "cpkt",
"src_buff": [
{
"type": "o",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 2,
"ops": [
{
"name": "rppkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 3,
"ops": [
{
"name": "rppkt",
"src_buff": [
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_type": "memory",
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 4,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
],
"tbg_info": {
"tb_id": 0,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
},
{
"id": 5,
"ops": [
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "o",
"index": 0,
"size": 1
}
],
"tbg_info": {
"tb_id": 1,
"tbg_size": 2
}
}
],
"channels": [],
"remote_buffer_refs": []
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": true,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}