Summary of this patch (informational text placed ahead of the first `diff --git` header):
  * CI: adds the `UnitTestExecutor` job and the `templates/ut-executor.yml` template, which runs
    `python3 -m pip install .` and then `executor_test.py` under `mpirun -np 2`, once per execution plan.
  * IB: `DefaultGidIndex` in `EndpointConfig` changes from 0 to -1; the `Endpoint::Impl` constructor
    replaces a negative `gidIndex` with `Env::ibGidIndex`, which is read from `MSCCLPP_IB_GID_INDEX`
    (default 0) and exposed to Python as `ib_gid_index`.
  * NPKit: the `#else` branch of `GetGpuClockRateInKhz()` reads the clock rate with
    `cudaDeviceGetAttribute(..., cudaDevAttrClockRate, ...)` instead of `cudaGetDeviceProperties`.
  * Tests: adds DSL programs under `test/executor-tests/algos/` and execution plans under
    `test/executor-tests/execution-plans/`.

diff --git a/.azure-pipelines/templates/ut-executor.yml b/.azure-pipelines/templates/ut-executor.yml
new file mode 100644
index 00000000..426daf17
--- /dev/null
+++ b/.azure-pipelines/templates/ut-executor.yml
@@ -0,0 +1,42 @@
+parameters:
+- name: subscription
+  type: string
+- name: vmssName
+  type: string
+- name: platform
+  type: string
+  default: 'cuda'
+- name: gpuArch
+  type: string
+
+steps:
+- template: deploy.yml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName: ${{ parameters.vmssName }}
+    platform: ${{ parameters.platform }}
+    gpuArch: ${{ parameters.gpuArch }}
+    deployArgs: 'single-node-test true ${{ parameters.platform }}'
+
+
+- template: run-remote-task.yml
+  parameters:
+    name: ExecutorTest
+    displayName: Run executor tests
+    remoteScript: |
+      python3 -m pip install .
+      PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans
+      TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py
+      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place
+      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place
+      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place
+      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place
+      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place
+      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place
+      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place
+      mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place
+
+- template: stop.yml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName: ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index 4e6f96b1..6b8c9eda 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -148,3 +148,24 @@ jobs:
       vmssName: mscclpp-mi300x-ci
       platform: rocm
       gpuArch: gfx942
+
+- job: UnitTestExecutor
+  timeoutInMinutes: 60
+  displayName: Test DSL Executor
+  pool:
+    name: msccl-ci-h100
+
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+
+  container:
+    image: $(containerImage)
+
+  steps:
+  - template: templates/ut-executor.yml
+    parameters:
+      subscription: mscclpp-ci-h100
+      vmssName: mscclpp-h100-ci
+      gpuArch: '90'
\ No newline at end of file
diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 37bdbd51..ca2fc34f 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -389,7 +389,7 @@ struct EndpointConfig {
     };
 
     static constexpr int DefaultPort = -1;
-    static constexpr int DefaultGidIndex = 0;
+    static constexpr int DefaultGidIndex = -1;
     static constexpr int DefaultMaxCqSize = 1024;
     static constexpr int DefaultMaxCqPollNum = 1;
     static constexpr int DefaultMaxSendWr = 8192;
@@ -418,7 +418,7 @@ struct EndpointConfig {
     /// Constructor.
     /// @param deviceIndex Device index.
     /// @param port Port number.
-    /// @param gidIndex GID index.
+    /// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable.
     /// @param maxCqSize Maximum send completion queue size.
     /// @param maxCqPollNum Maximum send completion queue poll count.
     /// @param maxSendWr Maximum outstanding send work requests.
diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp
index fb1da22c..a6dd306b 100644
--- a/include/mscclpp/env.hpp
+++ b/include/mscclpp/env.hpp
@@ -115,6 +115,10 @@ class Env {
   /// Default is false.
   const bool forceDisableGdr;
 
+  /// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
+  /// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified).
+  const int ibGidIndex;
+
  private:
   Env();
 
diff --git a/python/csrc/env_py.cpp b/python/csrc/env_py.cpp
index ce89fd3d..d4b2f5da 100644
--- a/python/csrc/env_py.cpp
+++ b/python/csrc/env_py.cpp
@@ -23,7 +23,8 @@ void register_env(nb::module_& m) {
       .def_ro("ibv_mode", &Env::ibvMode)
       .def_ro("cache_dir", &Env::cacheDir)
       .def_ro("npkit_dump_dir", &Env::npkitDumpDir)
-      .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream);
+      .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream)
+      .def_ro("ib_gid_index", &Env::ibGidIndex);
 
   m.def("env", &env);
 }
diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc
index 5ab4bad0..fe51e348 100644
--- a/src/core/endpoint.cc
+++ b/src/core/endpoint.cc
@@ -47,6 +47,11 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
       }
     }
 
+    // Resolve GID index: explicit value (>= 0) takes priority, otherwise use env
+    if (config_.ib.gidIndex < 0) {
+      config_.ib.gidIndex = env()->ibGidIndex;
+    }
+
     int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
 
     ibQp_ = contextImpl.getIbContext(config_.transport)
diff --git a/src/core/env.cpp b/src/core/env.cpp
index 96f53492..7a42471b 100644
--- a/src/core/env.cpp
+++ b/src/core/env.cpp
@@ -66,7 +66,8 @@ Env::Env()
       forceNcclFallbackOperation(readEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
       ncclSymmetricMemory(readEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
       forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)),
-      forceDisableGdr(readEnv("MSCCLPP_FORCE_DISABLE_GDR", false)) {}
+      forceDisableGdr(readEnv("MSCCLPP_FORCE_DISABLE_GDR", false)),
+      ibGidIndex(readEnv("MSCCLPP_IB_GID_INDEX", 0)) {}
 
 std::shared_ptr<Env> env() {
   static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -95,6 +96,7 @@ std::shared_ptr<Env> env() {
     logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
     logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
     logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
+    logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
   }
   return globalEnv;
 }
diff --git a/src/core/npkit/npkit.cc b/src/core/npkit/npkit.cc
index 30fc35c7..84457abf 100644
--- a/src/core/npkit/npkit.cc
+++ b/src/core/npkit/npkit.cc
@@ -103,10 +103,10 @@ static int GetGpuClockRateInKhz() {
   else
     return 25000;
 #else
-  cudaDeviceProp dev_prop;
+  int clockRate;
   MSCCLPP_CUDATHROW(cudaGetDevice(&dev_id));
-  MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev_id));
-  return dev_prop.clockRate;
+  MSCCLPP_CUDATHROW(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev_id));
+  return clockRate;
 #endif
 }
 #endif
diff --git a/test/executor-tests/algos/reduce.py b/test/executor-tests/algos/reduce.py
new file mode 100644
index 00000000..db630a43
--- /dev/null
+++ b/test/executor-tests/algos/reduce.py
@@ -0,0 +1,87 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Reduce Test
+
+This file tests the PUT, GET, COPY, REDUCE_SEND and READ_REDUCE_SEND
+operations. It implements a 2-GPU allreduce using the Simple protocol
+with instruction fusion enabled.
+"""
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def reduce(name, num_threads_per_block, min_message_size, max_message_size):
+    collective = AllReduce(2, 2, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        2,
+        protocol="Simple",
+        instr_fusion=True,
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=False,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # Setup ranks, channels, input and scratch buffers for 2-GPU allreduce
+        first_rank = Rank(0)
+        second_rank = Rank(1)
+        first_ch = MemoryChannel(1, 0)
+        second_ch = MemoryChannel(0, 1)
+        first_input_buffer = first_rank.get_input_buffer()
+        second_input_buffer = second_rank.get_input_buffer()
+        first_scratch_buffer = Buffer(0, 4)
+        second_scratch_buffer = Buffer(1, 4)
+
+        # Each rank copies its input chunks to scratch to prepare for remote access
+        first_rank.copy(first_scratch_buffer[2:4], first_input_buffer[2:4], tb=0)
+        second_rank.copy(second_scratch_buffer[0:2], second_input_buffer[0:2], tb=0)
+
+        # Signal and wait to ensure scratch data is visible to the remote rank
+        first_ch.signal(tb=0)
+        second_ch.signal(tb=0)
+
+        first_ch.wait(tb=0)
+        second_ch.wait(tb=0)
+
+        # Rank 0 reduces chunk 0 from rank 1's scratch and writes result to both ranks
+        first_ch.reduce(first_input_buffer[0:1], [second_scratch_buffer[0:1]], tb=0)
+        first_ch.put(second_input_buffer[0:1], first_input_buffer[0:1], tb=0)
+
+        # Rank 0 fetches chunk 1 from rank 1's scratch, reduces locally, and writes result to both ranks
+        first_ch.get(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb=0)
+        first_rank.reduce(first_input_buffer[1:2], [first_scratch_buffer[1:2]], tb=0)
+        first_ch.put(second_input_buffer[1:2], first_input_buffer[1:2], tb=0)
+
+        # Rank 1 reduces chunks 2-3 from rank 0's input, copies to scratch, and writes result to both ranks
+        second_ch.reduce(second_input_buffer[2:4], [first_input_buffer[2:4]], tb=0)
+        second_rank.copy(second_scratch_buffer[2:4], second_input_buffer[2:4], tb=0)
+        second_ch.put(first_input_buffer[2:4], second_scratch_buffer[2:4], tb=0)
+
+        # Final signal/wait to ensure all reduced data is consistent across both ranks
+        first_ch.signal(tb=0)
+        second_ch.signal(tb=0)
+
+        first_ch.wait(tb=0)
+        second_ch.wait(tb=0)
+
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+
+reduce(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/test/executor-tests/algos/reduce_nvls.py b/test/executor-tests/algos/reduce_nvls.py
new file mode 100644
index 00000000..e59b8247
--- /dev/null
+++ b/test/executor-tests/algos/reduce_nvls.py
@@ -0,0 +1,91 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Reduce NVLS Test
+
+This file tests the executor MULTI_LOAD_REDUCE_STORE operation using
+NVLS SwitchChannels. Each GPU reduces its chunk via the
+NVSwitch and broadcasts the result to all other GPUs.
+"""
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def reduce_nvls(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
+    chunksperloop = 1
+    collective = AllReduce(gpu_size, chunksperloop, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        instances=1,
+        protocol="Simple",
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=False,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # Creating Channels
+        nvls_chan = SwitchChannel(rank_list=[gpu for gpu in range(gpu_size)], buffer_type=BufferType.input)
+        channels = {}
+        for gpu in range(gpu_size):
+            for peer in range(gpu_size):
+                if peer != gpu:
+                    channels[(peer, gpu)] = MemoryChannel(peer, gpu)
+
+        # Synchronization to Ensure all the GPUs are Ready
+        for gpu in range(gpu_size):
+            src_rank = gpu
+            for peer in range(gpu_size):
+                if peer != src_rank:
+                    dst_rank = peer
+                    channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True)
+            for peer in range(gpu_size):
+                if peer != src_rank:
+                    dst_rank = peer
+                    channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True, data_sync=SyncType.after)
+
+        # Reducing and Storing the data
+        for gpu in range(gpu_size):
+            buffer_offset = gpu
+            rank = Rank(gpu)
+            input_buffer = rank.get_input_buffer()
+            nvls_chan.at_rank(gpu).reduce(
+                buffer_offset=buffer_offset, size=1, dst_chunk=input_buffer[gpu : gpu + 1], tb=0
+            )
+            nvls_chan.at_rank(gpu).broadcast(
+                src_chunk=input_buffer[gpu : gpu + 1], buffer_offset=buffer_offset, size=1, tb=0
+            )
+
+        # Synchronization to Ensure the GPUs finished
+        for gpu in range(gpu_size):
+            src_rank = gpu
+            for peer in range(gpu_size):
+                if peer != src_rank:
+                    dst_rank = peer
+                    channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
+            for peer in range(gpu_size):
+                if peer != src_rank:
+                    dst_rank = peer
+                    channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True)
+
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--num_gpus", type=int, help="number of gpus")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+
+reduce_nvls(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/test/executor-tests/algos/reduce_nvls_pipeline.py b/test/executor-tests/algos/reduce_nvls_pipeline.py
new file mode 100644
index 00000000..d7a4925e
--- /dev/null
+++ b/test/executor-tests/algos/reduce_nvls_pipeline.py
@@ -0,0 +1,94 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Reduce NVLS Pipeline Test
+
+This file tests the executor MULTI_LOAD_REDUCE_STORE operation in a
+pipeline context using SwitchChannel. Each GPU reduces
+its chunk via the NVSwitch and broadcasts the result, processing data
+in a pipelined loop over fixed-size iterations.
+"""
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+from mscclpp.language.loop import LoopIterationContext
+
+
+def reduce_nvls_pipeline(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
+    chunksperloop = 1
+    collective = AllReduce(gpu_size, chunksperloop, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        instances=1,
+        protocol="Simple",
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=False,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # Creating Channels
+        nvls_chan = SwitchChannel(rank_list=[gpu for gpu in range(gpu_size)], buffer_type=BufferType.input)
+        channels = {}
+        for gpu in range(gpu_size):
+            for peer in range(gpu_size):
+                if peer != gpu:
+                    channels[(peer, gpu)] = MemoryChannel(peer, gpu)
+
+        # Synchronization to Ensure all the GPUs are Ready
+        for gpu in range(gpu_size):
+            src_rank = gpu
+            for peer in range(gpu_size):
+                if peer != src_rank:
+                    dst_rank = peer
+                    channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True)
+            for peer in range(gpu_size):
+                if peer != src_rank:
+                    dst_rank = peer
+                    channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True, data_sync=SyncType.after)
+
+        # Pipeline Reducing and Storing the data
+        with LoopIterationContext(unit=2**20, num_chunks=1):
+            for gpu in range(gpu_size):
+                buffer_offset = gpu
+                rank = Rank(gpu)
+                input_buffer = rank.get_input_buffer()
+                nvls_chan.at_rank(gpu).reduce(
+                    buffer_offset=buffer_offset, size=1, dst_chunk=input_buffer[gpu : gpu + 1], tb=0
+                )
+                nvls_chan.at_rank(gpu).broadcast(
+                    src_chunk=input_buffer[gpu : gpu + 1], buffer_offset=buffer_offset, size=1, tb=0
+                )
+
+        # Synchronization to Ensure the GPUs finished
+        for gpu in range(gpu_size):
+            src_rank = gpu
+            for peer in range(gpu_size):
+                if peer != src_rank:
+                    dst_rank = peer
+                    channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
+            for peer in range(gpu_size):
+                if peer != src_rank:
+                    dst_rank = peer
+                    channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True)
+
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--num_gpus", type=int, help="number of gpus")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+
+reduce_nvls_pipeline(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/test/executor-tests/algos/reduce_pack.py b/test/executor-tests/algos/reduce_pack.py
new file mode 100644
index 00000000..9aa48caf
--- /dev/null
+++ b/test/executor-tests/algos/reduce_pack.py
@@ -0,0 +1,73 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Reduce Pack Test
+
+This file tests the REDUCE_COPY_SEND_PACKETS and REDUCE_SEND_PACKETS
+operations. It implements a 2-GPU allreduce with the LL (low-latency)
+packet protocol.
+"""
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def reduce_pack(name, num_threads_per_block, min_message_size, max_message_size):
+    chunksperloop = 1
+    gpu_size = 2
+    collective = AllReduce(gpu_size, chunksperloop, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        protocol="LL",
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=True,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # Setup ranks, channels, input and scratch buffers for 2-GPU allreduce
+        first_rank = Rank(0)
+        second_rank = Rank(1)
+        first_ch = MemoryChannel(1, 0)
+        second_ch = MemoryChannel(0, 1)
+        first_input_buffer = first_rank.get_input_buffer()
+        second_input_buffer = second_rank.get_input_buffer()
+        first_scratch_buffer = Buffer(0, 3)
+        second_scratch_buffer = Buffer(1, 3)
+
+        # Each rank sends its input chunk as packets to the other rank's scratch buffer
+        first_ch.put_packets(second_scratch_buffer[1:2], first_input_buffer[1:2], tb=0)
+        second_ch.put_packets(first_scratch_buffer[0:1], second_input_buffer[0:1], tb=0)
+
+        # Rank 0 reduces received scratch with its input, then sends the result to rank 1's scratch
+        first_rank.reduce(first_input_buffer[0:1], [first_scratch_buffer[0:1]], tb=1, packet=True)
+        first_ch.put_packets(second_scratch_buffer[0:1], first_input_buffer[0:1], tb=1)
+
+        # Rank 1 reduces received scratch with its input, then sends the result back to rank 0's scratch
+        second_rank.reduce(second_input_buffer[1:2], [second_scratch_buffer[1:2]], tb=1, packet=True)
+        second_rank.copy_packets(second_scratch_buffer[2:3], second_input_buffer[1:2], tb=1)
+        second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[2:3], tb=1)
+
+        # Both ranks unpack the final reduced packets from scratch into their output buffers
+        first_rank.unpack_packets(first_input_buffer[1:2], first_scratch_buffer[1:2], tb=2)
+        second_rank.unpack_packets(second_input_buffer[0:1], second_scratch_buffer[0:1], tb=2)
+
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+
+reduce_pack(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/test/executor-tests/algos/reduce_pack_tbg.py b/test/executor-tests/algos/reduce_pack_tbg.py
new file mode 100644
index 00000000..eaca4c4c
--- /dev/null
+++ b/test/executor-tests/algos/reduce_pack_tbg.py
@@ -0,0 +1,77 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Reduce Pack Thread Block Group Test
+
+This file tests the REDUCE_COPY_SEND_PACKETS and REDUCE_SEND_PACKETS
+operations using thread block groups. It implements a 2-GPU allreduce
+with the LL (low-latency) packet protocol, where multiple thread
+blocks cooperate on each phase.
+"""
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def reduce_pack_tbg(name, num_threads_per_block, min_message_size, max_message_size):
+    chunksperloop = 1
+    gpu_size = 2
+    collective = AllReduce(gpu_size, chunksperloop, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        protocol="LL",
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=True,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # Setup ranks, channels, input and scratch buffers for 2-GPU allreduce
+        first_rank = Rank(0)
+        second_rank = Rank(1)
+        first_ch = MemoryChannel(1, 0)
+        second_ch = MemoryChannel(0, 1)
+        first_input_buffer = first_rank.get_input_buffer()
+        second_input_buffer = second_rank.get_input_buffer()
+        first_scratch_buffer = Buffer(0, 3)
+        second_scratch_buffer = Buffer(1, 3)
+        tbg = []
+        for i in range(3):
+            tbg.append(ThreadBlockGroup(tb_list=[2 * i, 2 * i + 1]))
+
+        # Each rank sends its input chunk as packets to the other rank's scratch buffer
+        first_ch.put_packets(second_scratch_buffer[1:2], first_input_buffer[1:2], tb_group=tbg[0])
+        second_ch.put_packets(first_scratch_buffer[0:1], second_input_buffer[0:1], tb_group=tbg[0])
+
+        # Rank 0 reduces received scratch with its input, then sends the result to rank 1's scratch
+        first_rank.reduce(first_input_buffer[0:1], [first_scratch_buffer[0:1]], tb_group=tbg[1], packet=True)
+        first_ch.put_packets(second_scratch_buffer[0:1], first_input_buffer[0:1], tb_group=tbg[1])
+
+        # Rank 1 reduces received scratch with its input, then sends the result back to rank 0's scratch
+        second_rank.reduce(second_input_buffer[1:2], [second_scratch_buffer[1:2]], tb_group=tbg[1], packet=True)
+        second_rank.copy_packets(second_scratch_buffer[2:3], second_input_buffer[1:2], tb_group=tbg[1])
+        second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[2:3], tb_group=tbg[1])
+
+        # Both ranks unpack the final reduced packets from scratch into their output buffers
+        first_rank.unpack_packets(first_input_buffer[1:2], first_scratch_buffer[1:2], tb_group=tbg[2])
+        second_rank.unpack_packets(second_input_buffer[0:1], second_scratch_buffer[0:1], tb_group=tbg[2])
+
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+
+reduce_pack_tbg(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/test/executor-tests/algos/reduce_tbg.py b/test/executor-tests/algos/reduce_tbg.py
new file mode 100644
index 00000000..103c6d20
--- /dev/null
+++ b/test/executor-tests/algos/reduce_tbg.py
@@ -0,0 +1,99 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Reduce Thread Block Group Test
+
+This file tests the PUT, GET, COPY, REDUCE_SEND and READ_REDUCE_SEND
+operations using thread block groups. It implements a 2-GPU allreduce
+with the Simple protocol and instruction fusion, where multiple thread
+blocks cooperate on each operation.
+"""
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def reduce_tbg(name, num_threads_per_block, min_message_size, max_message_size):
+    collective = AllReduce(2, 2, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        2,
+        protocol="Simple",
+        instr_fusion=True,
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=False,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # Setup ranks, memory channels, input buffers, and scratch buffers for 2-GPU AllReduce
+        first_rank = Rank(0)
+        second_rank = Rank(1)
+        first_ch_tb0 = MemoryChannel(1, 0)
+        first_ch_tb1 = MemoryChannel(1, 0)
+        second_ch_tb0 = MemoryChannel(0, 1)
+        second_ch_tb1 = MemoryChannel(0, 1)
+        first_input_buffer = first_rank.get_input_buffer()
+        second_input_buffer = second_rank.get_input_buffer()
+        first_scratch_buffer = Buffer(0, 4)
+        second_scratch_buffer = Buffer(1, 4)
+        tbg = ThreadBlockGroup(tb_list=[0, 1])
+
+        # Each rank copies its input chunks to scratch to prepare for remote access
+        first_rank.copy(first_scratch_buffer[2:4], first_input_buffer[2:4], tb_group=tbg)
+        second_rank.copy(second_scratch_buffer[0:2], second_input_buffer[0:2], tb_group=tbg)
+
+        # Signal and wait on both TBs to ensure scratch data is visible to the remote rank
+        first_ch_tb0.signal(tb=0)
+        first_ch_tb1.signal(tb=1)
+        second_ch_tb0.signal(tb=0)
+        second_ch_tb1.signal(tb=1)
+
+        first_ch_tb0.wait(tb=0)
+        first_ch_tb1.wait(tb=1)
+        second_ch_tb0.wait(tb=0)
+        second_ch_tb1.wait(tb=1)
+
+        # Rank 0 reduces chunk 0 from rank 1's scratch and writes result to both ranks
+        first_ch_tb0.reduce(first_input_buffer[0:1], [second_scratch_buffer[0:1]], tb_group=tbg)
+        first_ch_tb0.put(second_input_buffer[0:1], first_input_buffer[0:1], tb_group=tbg)
+
+        # Rank 0 fetches chunk 1 from rank 1's scratch, reduces locally, and writes result to both ranks
+        first_ch_tb0.get(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb_group=tbg)
+        first_rank.reduce(first_input_buffer[1:2], [first_scratch_buffer[1:2]], tb_group=tbg)
+        first_ch_tb0.put(second_input_buffer[1:2], first_input_buffer[1:2], tb_group=tbg)
+
+        # Rank 1 reduces chunks 2-3 from rank 0's input, copies to scratch, and writes result to both ranks
+        second_ch_tb0.reduce(second_input_buffer[2:4], [first_input_buffer[2:4]], tb_group=tbg)
+        second_rank.copy(second_scratch_buffer[2:4], second_input_buffer[2:4], tb_group=tbg)
+        second_ch_tb0.put(first_input_buffer[2:4], second_scratch_buffer[2:4], tb_group=tbg)
+
+        # Final signal/wait on both TBs to ensure all reduced data is consistent across both ranks
+        first_ch_tb0.signal(tb=0)
+        first_ch_tb1.signal(tb=1)
+        second_ch_tb0.signal(tb=0)
+        second_ch_tb1.signal(tb=1)
+
+        first_ch_tb0.wait(tb=0)
+        first_ch_tb1.wait(tb=1)
+        second_ch_tb0.wait(tb=0)
+        second_ch_tb1.wait(tb=1)
+
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+
+reduce_tbg(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/test/executor-tests/algos/transfer_pack.py b/test/executor-tests/algos/transfer_pack.py
new file mode 100644
index 00000000..e382f012
--- /dev/null
+++ b/test/executor-tests/algos/transfer_pack.py
@@ -0,0 +1,67 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Transfer Pack Test
+
+This file tests the UNPACK_PACKETS, COPY_PACKETS, READ_PUT_PACKETS and
+PUT_PACKETS operations. It implements a 2-GPU allgather with the LL
+(low-latency) packet protocol.
+"""
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def transfer_pack(name, num_threads_per_block, min_message_size, max_message_size):
+    chunksperloop = 1
+    gpu_size = 2
+    collective = AllGather(gpu_size, chunksperloop, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        protocol="LL",
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=True,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # Setup ranks, channels, output and scratch buffers for 2-GPU allgather
+        first_rank = Rank(0)
+        second_rank = Rank(1)
+        first_ch = MemoryChannel(1, 0)
+        second_ch = MemoryChannel(0, 1)
+        first_output_buffer = first_rank.get_output_buffer()
+        second_output_buffer = second_rank.get_output_buffer()
+        first_scratch_buffer = Buffer(0, 2)
+        second_scratch_buffer = Buffer(1, 2)
+
+        # Rank 0 sends its output chunk as packets to rank 1's scratch buffer
+        first_ch.put_packets(second_scratch_buffer[0:1], first_output_buffer[0:1], tb=0)
+
+        # Rank 1 copies its output to scratch, then sends it as packets to rank 0's scratch buffer
+        second_rank.copy_packets(second_scratch_buffer[1:2], second_output_buffer[1:2], tb=0)
+        second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb=1)
+
+        # Both ranks unpack received packets from scratch into their output buffers
+        first_rank.unpack_packets(first_output_buffer[1:2], first_scratch_buffer[1:2], tb=1)
+        second_rank.unpack_packets(second_output_buffer[0:1], second_scratch_buffer[0:1], tb=2)
+
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+
+transfer_pack(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/test/executor-tests/algos/transfer_pack_tbg.py b/test/executor-tests/algos/transfer_pack_tbg.py
new file mode 100644
index 00000000..5a2dc11b
--- /dev/null
+++ b/test/executor-tests/algos/transfer_pack_tbg.py
@@ -0,0 +1,71 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Transfer Pack Thread Block Group Test
+
+This file tests the UNPACK_PACKETS, COPY_PACKETS, READ_PUT_PACKETS and
+PUT_PACKETS operations using thread block groups. It implements a 2-GPU
+allgather with the LL (low-latency) packet protocol, where multiple
+thread blocks cooperate on each phase.
+"""
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def transfer_pack_tbg(name, num_threads_per_block, min_message_size, max_message_size):
+    chunksperloop = 1
+    gpu_size = 2
+    collective = AllGather(gpu_size, chunksperloop, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        protocol="LL",
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=True,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # Setup ranks, channels, output and scratch buffers for 2-GPU allgather
+        first_rank = Rank(0)
+        second_rank = Rank(1)
+        first_ch = MemoryChannel(1, 0)
+        second_ch = MemoryChannel(0, 1)
+        first_output_buffer = first_rank.get_output_buffer()
+        second_output_buffer = second_rank.get_output_buffer()
+        first_scratch_buffer = Buffer(0, 2)
+        second_scratch_buffer = Buffer(1, 2)
+        tbg = []
+        for i in range(3):
+            tbg.append(ThreadBlockGroup(tb_list=[2 * i, 2 * i + 1]))
+
+        # Rank 0 sends its output chunk as packets to rank 1's scratch buffer
+        first_ch.put_packets(second_scratch_buffer[0:1], first_output_buffer[0:1], tb_group=tbg[0])
+
+        # Rank 1 copies its output to scratch, then sends it as packets to rank 0's scratch buffer
+        second_rank.copy_packets(second_scratch_buffer[1:2], second_output_buffer[1:2], tb_group=tbg[0])
+        second_ch.read_put_packets(first_scratch_buffer[1:2], second_scratch_buffer[1:2], tb_group=tbg[1])
+
+        # Both ranks unpack received packets from scratch into their output buffers
+        first_rank.unpack_packets(first_output_buffer[1:2], first_scratch_buffer[1:2], tb_group=tbg[1])
+        second_rank.unpack_packets(second_output_buffer[0:1], second_scratch_buffer[0:1], tb_group=tbg[2])
+
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+
+transfer_pack_tbg(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/test/executor-tests/execution-plans/reduce.json b/test/executor-tests/execution-plans/reduce.json
new file mode 100644
index 00000000..49a1048a
--- /dev/null
+++ b/test/executor-tests/execution-plans/reduce.json
@@ -0,0 +1,389 @@
+{
+  "name": "reduce",
+  "collective": "allreduce",
+  "protocol": "Simple",
+  "inplace": true,
+  "reuse_resources": false,
+  "gpus": [
+    {
+      "id": 0,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 4,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "copy",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 2
+                }
+              ],
+              "dst_buff": [
+ { + "type": "s", + "index": 2, + "size": 2 + } + ] + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rres", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "buffer_id": 1, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum" + }, + { + "name": "get", + "src_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "res", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "buffer_id": 1, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum" + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0, + 1 + ] + } + ] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 1 + ] + } + ], + "remote_buffers": [ + { + "rank": 1, + "type": "s", + "access_channel_types": [ + "memory" + ] + }, + { + "rank": 1, + "type": "i", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + }, + { + "id": 1, + "input_chunks": 4, + "output_chunks": 4, + 
"scratch_chunks": 4, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "copy", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 0, + "size": 2 + } + ] + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rre", + "src_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + }, + { + "buffer_id": 0, + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + } + ], + "channel_type": "memory", + "reduce_op": "sum" + }, + { + "name": "nop" + }, + { + "name": "copy", + "src_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 2, + "size": 2 + } + ] + }, + { + "name": "nop" + }, + { + "name": "put", + "src_buff": [ + { + "type": "s", + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 2, + "size": 2 + } + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 0 + ] + } + ], + "remote_buffers": [ + { + "rank": 0, + "type": "i", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + } + ], + "num_threads_per_block": 1024, + "use_double_scratch_buffer": false, + "buffer_alignment": 16, + "min_message_size": 0, + "max_message_size": 18446744073709551615 +} diff --git 
a/test/executor-tests/execution-plans/reduce_nvls.json b/test/executor-tests/execution-plans/reduce_nvls.json new file mode 100644 index 00000000..ac1261d6 --- /dev/null +++ b/test/executor-tests/execution-plans/reduce_nvls.json @@ -0,0 +1,246 @@ +{ + "name": "allreduce_nvls", + "collective": "allreduce", + "protocol": "Simple", + "inplace": true, + "reuse_resources": false, + "gpus": [ + { + "id": 0, + "input_chunks": 2, + "output_chunks": 2, + "scratch_chunks": 0, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "nop" + }, + { + "name": "rlxsignal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rlxwait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "glres", + "src_buff": [ + { + "switch_channel_id": 0, + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "switch_channel_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "switch", + "reduce_op": "sum" + }, + { + "name": "nop" + }, + { + "name": "rlxsignal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rlxwait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + }, + { + "channel_type": "switch", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 1 + ] + }, + { + "channel_type": "switch", + "buffer_type": "i", + "rank_groups": [ + { + "size": 2, + "ranks": [ + 0, + 1 + ] + } + ] + } + ], + "remote_buffers": [], + "semaphores": [] + }, + { + "id": 1, + "input_chunks": 2, + "output_chunks": 2, + "scratch_chunks": 0, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "nop" + }, + { + "name": "rlxsignal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rlxwait", + 
"channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "glres", + "src_buff": [ + { + "switch_channel_id": 0, + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "switch_channel_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "switch", + "reduce_op": "sum" + }, + { + "name": "nop" + }, + { + "name": "rlxsignal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rlxwait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + }, + { + "channel_type": "switch", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 0 + ] + }, + { + "channel_type": "switch", + "buffer_type": "i", + "rank_groups": [ + { + "size": 2, + "ranks": [ + 0, + 1 + ] + } + ] + } + ], + "remote_buffers": [], + "semaphores": [] + } + ], + "num_threads_per_block": 1024, + "use_double_scratch_buffer": false, + "buffer_alignment": 16, + "min_message_size": 0, + "max_message_size": 18446744073709551615 +} diff --git a/test/executor-tests/execution-plans/reduce_nvls_pipeline.json b/test/executor-tests/execution-plans/reduce_nvls_pipeline.json new file mode 100644 index 00000000..c9fb0760 --- /dev/null +++ b/test/executor-tests/execution-plans/reduce_nvls_pipeline.json @@ -0,0 +1,264 @@ +{ + "name": "allreduce_nvls_pipeline", + "collective": "allreduce", + "protocol": "Simple", + "inplace": true, + "reuse_resources": false, + "gpus": [ + { + "id": 0, + "input_chunks": 2, + "output_chunks": 2, + "scratch_chunks": 0, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "nop" + }, + { + "name": "rlxsignal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rlxwait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + 
}, + { + "name": "pipeline", + "iter_context": { + "unit_size": 1048576, + "num_chunks": 1 + }, + "ops": [ + { + "name": "glres", + "src_buff": [ + { + "switch_channel_id": 0, + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "switch_channel_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "switch", + "reduce_op": "sum" + } + ] + }, + { + "name": "nop" + }, + { + "name": "rlxsignal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rlxwait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + }, + { + "channel_type": "switch", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 1 + ] + }, + { + "channel_type": "switch", + "buffer_type": "i", + "rank_groups": [ + { + "size": 2, + "ranks": [ + 0, + 1 + ] + } + ] + } + ], + "remote_buffers": [], + "semaphores": [] + }, + { + "id": 1, + "input_chunks": 2, + "output_chunks": 2, + "scratch_chunks": 0, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "nop" + }, + { + "name": "rlxsignal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rlxwait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "pipeline", + "iter_context": { + "unit_size": 1048576, + "num_chunks": 1 + }, + "ops": [ + { + "name": "glres", + "src_buff": [ + { + "switch_channel_id": 0, + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "switch_channel_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "switch", + "reduce_op": "sum" + } + ] + }, + { + "name": "nop" + }, + { + "name": "rlxsignal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rlxwait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + 
{ + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + }, + { + "channel_type": "switch", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 0 + ] + }, + { + "channel_type": "switch", + "buffer_type": "i", + "rank_groups": [ + { + "size": 2, + "ranks": [ + 0, + 1 + ] + } + ] + } + ], + "remote_buffers": [], + "semaphores": [] + } + ], + "num_threads_per_block": 1024, + "use_double_scratch_buffer": false, + "buffer_alignment": 16, + "min_message_size": 0, + "max_message_size": 18446744073709551615 +} diff --git a/test/executor-tests/execution-plans/reduce_pack.json b/test/executor-tests/execution-plans/reduce_pack.json new file mode 100644 index 00000000..b74d5772 --- /dev/null +++ b/test/executor-tests/execution-plans/reduce_pack.json @@ -0,0 +1,297 @@ +{ + "name": "reduce_pack", + "collective": "allreduce", + "protocol": "LL", + "inplace": true, + "reuse_resources": false, + "gpus": [ + { + "id": 0, + "input_chunks": 2, + "output_chunks": 2, + "scratch_chunks": 3, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "ppkt", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "respkt", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + 
"remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 2, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + } + ] + } + ], + "channels": [], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 1 + ] + } + ], + "remote_buffers": [ + { + "rank": 1, + "type": "s", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + }, + { + "id": 1, + "input_chunks": 2, + "output_chunks": 2, + "scratch_chunks": 3, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "ppkt", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "recspkt", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "type": "s", + "index": 2, + "size": 1 + }, + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 2, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + } + ] + } + ], + "channels": [], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + 
"connected_to": [ + 0 + ] + } + ], + "remote_buffers": [ + { + "rank": 0, + "type": "s", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + } + ], + "num_threads_per_block": 1024, + "use_double_scratch_buffer": true, + "buffer_alignment": 16, + "min_message_size": 0, + "max_message_size": 18446744073709551615 +} diff --git a/test/executor-tests/execution-plans/reduce_pack_tbg.json b/test/executor-tests/execution-plans/reduce_pack_tbg.json new file mode 100644 index 00000000..4380de6e --- /dev/null +++ b/test/executor-tests/execution-plans/reduce_pack_tbg.json @@ -0,0 +1,576 @@ +{ + "name": "reduce_pack_tbg", + "collective": "allreduce", + "protocol": "LL", + "inplace": true, + "reuse_resources": false, + "gpus": [ + { + "id": 0, + "input_chunks": 2, + "output_chunks": 2, + "scratch_chunks": 3, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "ppkt", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "ppkt", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 2, + "ops": [ + { + "name": "respkt", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + 
"index": 0, + "size": 1 + }, + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 3, + "ops": [ + { + "name": "respkt", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum", + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 4, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + } + ], + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + "channels": [], + "remote_buffer_refs": [] + }, + { + "id": 5, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + } + ], + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + } + ], + "channels": [], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 1 + ] + } + ], + "remote_buffers": [ + { + "rank": 1, + "type": "s", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + }, + { + "id": 1, + "input_chunks": 2, + "output_chunks": 2, + "scratch_chunks": 3, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "ppkt", + "src_buff": [ + { + "type": "i", + "index": 0, + 
"size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "ppkt", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 2, + "ops": [ + { + "name": "recspkt", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "type": "s", + "index": 2, + "size": 1 + }, + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 3, + "ops": [ + { + "name": "recspkt", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "type": "s", + "index": 2, + "size": 1 + }, + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum", + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + 
"channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 4, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + } + ], + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + "channels": [], + "remote_buffer_refs": [] + }, + { + "id": 5, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + } + ], + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + } + ], + "channels": [], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 0 + ] + } + ], + "remote_buffers": [ + { + "rank": 0, + "type": "s", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + } + ], + "num_threads_per_block": 1024, + "use_double_scratch_buffer": true, + "buffer_alignment": 16, + "min_message_size": 0, + "max_message_size": 18446744073709551615 +} diff --git a/test/executor-tests/execution-plans/reduce_tbg.json b/test/executor-tests/execution-plans/reduce_tbg.json new file mode 100644 index 00000000..a4683236 --- /dev/null +++ b/test/executor-tests/execution-plans/reduce_tbg.json @@ -0,0 +1,773 @@ +{ + "name": "reduce_tbg", + "collective": "allreduce", + "protocol": "Simple", + "inplace": true, + "reuse_resources": false, + "gpus": [ + { + "id": 0, + "input_chunks": 4, + "output_chunks": 4, + "scratch_chunks": 4, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "copy", + "src_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 2, + "size": 2 + } + ], + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + 
}, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rres", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "buffer_id": 1, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + }, + { + "name": "get", + "src_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "channel_ids": [ + 0 + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "res", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "buffer_id": 1, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0, + 1 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "copy", + "src_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 2, + "size": 2 + } + ], + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + 
"channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rres", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + }, + { + "buffer_id": 1, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum", + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + }, + { + "name": "get", + "src_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "channel_ids": [ + 1 + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "res", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + }, + { + "buffer_id": 1, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory", + "reduce_op": "sum", + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 1, + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0, + 1 + ] + } + ] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 1, + 1 + ] + } + ], + "remote_buffers": [ + { + "rank": 1, + "type": "s", + "access_channel_types": [ + "memory" + ] + }, + { + "rank": 1, + "type": "i", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + }, + { + "id": 1, + "input_chunks": 4, + "output_chunks": 4, + "scratch_chunks": 4, + "threadblocks": [ + { + "id": 0, + 
"ops": [ + { + "name": "copy", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 0, + "size": 2 + } + ], + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rre", + "src_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + }, + { + "buffer_id": 0, + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + } + ], + "channel_type": "memory", + "reduce_op": "sum", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "copy", + "src_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 2, + "size": 2 + } + ], + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "put", + "src_buff": [ + { + "type": "s", + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 2, + "size": 2 + } + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "copy", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 0, + "size": 2 + } + ], + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + }, + { + "name": 
"nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "rre", + "src_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + }, + { + "buffer_id": 0, + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + } + ], + "channel_type": "memory", + "reduce_op": "sum", + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "copy", + "src_buff": [ + { + "type": "i", + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 2, + "size": 2 + } + ], + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "put", + "src_buff": [ + { + "type": "s", + "index": 2, + "size": 2 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 2, + "size": 2 + } + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + }, + { + "name": "nop" + }, + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "memory" + }, + { + "name": "nop" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 1, + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 0, + 0 + ] + } + ], + "remote_buffers": [ + { + "rank": 0, + "type": "i", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + } + ], + "num_threads_per_block": 1024, + "use_double_scratch_buffer": false, + "buffer_alignment": 16, + "min_message_size": 0, + "max_message_size": 18446744073709551615 +} diff --git a/test/executor-tests/execution-plans/transfer_pack.json 
b/test/executor-tests/execution-plans/transfer_pack.json new file mode 100644 index 00000000..270d6c13 --- /dev/null +++ b/test/executor-tests/execution-plans/transfer_pack.json @@ -0,0 +1,216 @@ +{ + "name": "transfer_pack", + "collective": "allgather", + "protocol": "LL", + "inplace": true, + "reuse_resources": false, + "gpus": [ + { + "id": 0, + "input_chunks": 1, + "output_chunks": 2, + "scratch_chunks": 2, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "ppkt", + "src_buff": [ + { + "type": "o", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "o", + "index": 1, + "size": 1 + } + ] + } + ], + "channels": [], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 1 + ] + } + ], + "remote_buffers": [ + { + "rank": 1, + "type": "s", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + }, + { + "id": 1, + "input_chunks": 1, + "output_chunks": 2, + "scratch_chunks": 2, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "cpkt", + "src_buff": [ + { + "type": "o", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ] + } + ], + "channels": [], + "remote_buffer_refs": [] + }, + { + "id": 1, + "ops": [ + { + "name": "rppkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory" + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + 
"remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 2, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "o", + "index": 0, + "size": 1 + } + ] + } + ], + "channels": [], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 0 + ] + } + ], + "remote_buffers": [ + { + "rank": 0, + "type": "s", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + } + ], + "num_threads_per_block": 1024, + "use_double_scratch_buffer": true, + "buffer_alignment": 16, + "min_message_size": 0, + "max_message_size": 18446744073709551615 +} diff --git a/test/executor-tests/execution-plans/transfer_pack_tbg.json b/test/executor-tests/execution-plans/transfer_pack_tbg.json new file mode 100644 index 00000000..bec8459d --- /dev/null +++ b/test/executor-tests/execution-plans/transfer_pack_tbg.json @@ -0,0 +1,406 @@ +{ + "name": "transfer_pack_tbg", + "collective": "allgather", + "protocol": "LL", + "inplace": true, + "reuse_resources": false, + "gpus": [ + { + "id": 0, + "input_chunks": 1, + "output_chunks": 2, + "scratch_chunks": 2, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "ppkt", + "src_buff": [ + { + "type": "o", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "ppkt", + "src_buff": [ + { + "type": "o", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 1, + "tbg_size": 
2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 2, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "o", + "index": 1, + "size": 1 + } + ], + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + "channels": [], + "remote_buffer_refs": [] + }, + { + "id": 3, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "o", + "index": 1, + "size": 1 + } + ], + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + } + ], + "channels": [], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 1 + ] + } + ], + "remote_buffers": [ + { + "rank": 1, + "type": "s", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + }, + { + "id": 1, + "input_chunks": 1, + "output_chunks": 2, + "scratch_chunks": 2, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "cpkt", + "src_buff": [ + { + "type": "o", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + "channels": [], + "remote_buffer_refs": [] + }, + { + "id": 1, + "ops": [ + { + "name": "cpkt", + "src_buff": [ + { + "type": "o", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + } + ], + "channels": [], + "remote_buffer_refs": [] + }, + { + "id": 2, + "ops": [ + { + "name": "rppkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + 
"channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 3, + "ops": [ + { + "name": "rppkt", + "src_buff": [ + { + "type": "s", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_type": "memory", + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + } + ], + "channels": [ + { + "channel_type": "memory", + "channel_ids": [ + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "memory", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 4, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "o", + "index": 0, + "size": 1 + } + ], + "tbg_info": { + "tb_id": 0, + "tbg_size": 2 + } + } + ], + "channels": [], + "remote_buffer_refs": [] + }, + { + "id": 5, + "ops": [ + { + "name": "upkt", + "src_buff": [ + { + "type": "s", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "type": "o", + "index": 0, + "size": 1 + } + ], + "tbg_info": { + "tb_id": 1, + "tbg_size": 2 + } + } + ], + "channels": [], + "remote_buffer_refs": [] + } + ], + "channels": [ + { + "channel_type": "memory", + "connected_to": [ + 0 + ] + } + ], + "remote_buffers": [ + { + "rank": 0, + "type": "s", + "access_channel_types": [ + "memory" + ] + } + ], + "semaphores": [] + } + ], + "num_threads_per_block": 1024, + "use_double_scratch_buffer": true, + "buffer_alignment": 16, + "min_message_size": 0, + "max_message_size": 18446744073709551615 +}