Torch integration (#692)

Reorganize the current native algorithm implementation and the DSL algorithm
implementation.
Provide a unified API for both DSL and native algorithms, and provide an
interface to tune the algorithms.
Provide an interface for PyTorch integration with both the native API and the DSL.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
This commit is contained in:
Binyang Li
2026-01-21 20:32:24 -08:00
committed by GitHub
parent 78ce9fac8d
commit a707273701
156 changed files with 6107 additions and 4076 deletions

View File

@@ -94,7 +94,7 @@ jobs:
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" \
sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -106,7 +106,7 @@ jobs:
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
-x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \
-x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \

View File

@@ -74,13 +74,13 @@ steps:
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -101,9 +101,9 @@ steps:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -124,15 +124,15 @@ steps:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -152,10 +152,10 @@ steps:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -177,7 +177,7 @@ steps:
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -200,7 +200,7 @@ steps:
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 -m pip install .; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
kill $CHILD_PID
@@ -223,9 +223,9 @@ steps:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
./build/test/perf/fifo_test"'
./build/bin/perf/fifo_test"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -130,7 +130,7 @@ steps:
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
@@ -147,7 +147,7 @@ steps:
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
@@ -164,7 +164,7 @@ steps:
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -198,11 +198,12 @@ steps:
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -218,11 +219,12 @@ steps:
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -238,11 +240,12 @@ steps:
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
@@ -259,10 +262,10 @@ steps:
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2

View File

@@ -71,7 +71,7 @@ steps:
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py::test_executor -x"'
kill $CHILD_PID

View File

@@ -87,8 +87,8 @@ steps:
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
@@ -116,7 +116,7 @@ steps:
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \

View File

@@ -72,8 +72,8 @@ steps:
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
./build/test/unit_tests"'
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -94,10 +94,10 @@ steps:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests"'
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -117,7 +117,7 @@ steps:
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID

View File

@@ -41,9 +41,9 @@ cd ..
For testing after successful build:
```bash
# To run all tests
mpirun -np 2 ./build/test/mp_unit_tests
mpirun -np 2 ./build/bin/mp_unit_tests
# To run tests excluding IB-related ones (when IB is not available)
mpirun -np 2 ./build/test/mp_unit_tests --gtest_filter=-*Ib*
mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib*
```
For building a Python package:

View File

@@ -36,32 +36,32 @@ jobs:
- name: Run mscclpp AllGather test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- name: Run mscclpp SendRecv test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- name: Run mscclpp AllReduce test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- name: Run mscclpp AllToAll test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- name: Check collective primitives performance
run: |

View File

@@ -37,14 +37,14 @@ jobs:
- name: UnitTests
run: |
./build/test/unit_tests
./build/bin/unit_tests
- name: MpUnitTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- name: PyTests
run: |

1
.gitignore vendored
View File

@@ -6,3 +6,4 @@ __pycache__
.*.swp
.idea/
*.so
_codeql_detected_source_root

View File

@@ -16,6 +16,11 @@ endif()
set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}")
# Set output directories for all targets
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
find_package(Git)
set(GIT_HASH "UNKNOWN")
if(Git_FOUND)
@@ -44,7 +49,8 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
option(MSCCLPP_ENABLE_TRACE "Enable tracing" OFF)
option(MSCCLPP_BUILD_TESTS "Build tests" ON)
option(MSCCLPP_BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(MSCCLPP_BUILD_APPS_NCCL "Build NCCL interfaces" ON)
option(MSCCLPP_BUILD_EXT_NCCL "Build NCCL interfaces" ON)
option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON)
option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF)
option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
@@ -164,54 +170,12 @@ include(FetchContent)
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
FetchContent_MakeAvailable(json)
add_library(mscclpp_obj OBJECT)
target_include_directories(mscclpp_obj
SYSTEM PRIVATE
${GPU_INCLUDE_DIRS}
${NUMA_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads dl)
if(MSCCLPP_USE_IB)
target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
endif()
set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
if(MSCCLPP_USE_CUDA)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_CUDA)
elseif(MSCCLPP_USE_ROCM)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_ROCM)
foreach(arch ${MSCCLPP_GPU_ARCHS})
target_compile_options(mscclpp_obj PRIVATE --offload-arch=${arch})
endforeach()
endif()
if(MSCCLPP_ENABLE_TRACE)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_ENABLE_TRACE)
endif()
if(MSCCLPP_NPKIT_FLAGS)
target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()
# libmscclpp
add_library(mscclpp SHARED)
target_link_libraries(mscclpp PUBLIC mscclpp_obj)
set_target_properties(mscclpp PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
add_library(mscclpp_static STATIC)
target_link_libraries(mscclpp_static PUBLIC mscclpp_obj)
set_target_properties(mscclpp_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
add_subdirectory(include)
add_subdirectory(src)
if("${INSTALL_PREFIX}" STREQUAL "")
set(INSTALL_PREFIX "./")
endif()
install(TARGETS mscclpp_obj
FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX}/include)
install(TARGETS mscclpp
LIBRARY DESTINATION ${INSTALL_PREFIX}/lib)
install(TARGETS mscclpp_static
ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib)
add_subdirectory(src)
add_subdirectory(include)
# Tests
if(MSCCLPP_BUILD_TESTS)
@@ -223,8 +187,3 @@ endif()
if(MSCCLPP_BUILD_PYTHON_BINDINGS)
add_subdirectory(python)
endif()
# NCCL interfaces
if(MSCCLPP_BUILD_APPS_NCCL)
add_subdirectory(apps/nccl)
endif()

View File

@@ -1,51 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Build the NCCL-compatible shim (mscclpp_nccl) plus an LD_AUDIT shim library.
# Gather shim sources, public headers, and audit-shim sources.
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/*.cu)
file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS include/*.h)
file(GLOB_RECURSE AUDIT_SHIM CONFIGURE_DEPENDS audit-shim/*)
# ROCm builds compile the .cu sources as C++ (hipcc consumes them as CXX).
if(MSCCLPP_USE_ROCM)
set_source_files_properties(${SOURCES} PROPERTIES LANGUAGE CXX)
endif()
# Object library shared by the shared and static mscclpp_nccl targets.
add_library(mscclpp_nccl_obj OBJECT)
target_sources(mscclpp_nccl_obj PRIVATE ${SOURCES})
target_sources(mscclpp_nccl_obj PUBLIC FILE_SET HEADERS FILES ${HEADERS})
target_include_directories(mscclpp_nccl_obj PRIVATE include ${PROJECT_SOURCE_DIR}/src/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
target_link_libraries(mscclpp_nccl_obj PRIVATE ${GPU_LIBRARIES} PUBLIC mscclpp_obj)
set_target_properties(mscclpp_nccl_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
# Per-backend compile definitions; ROCm additionally needs per-arch offload flags.
if(MSCCLPP_USE_CUDA)
target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_CUDA)
elseif(MSCCLPP_USE_ROCM)
target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_ROCM)
foreach(arch ${MSCCLPP_GPU_ARCHS})
target_compile_options(mscclpp_nccl_obj PRIVATE --offload-arch=${arch})
endforeach()
endif()
# Optional NPKit profiling instrumentation flags.
if(MSCCLPP_NPKIT_FLAGS)
target_compile_definitions(mscclpp_nccl_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()
# Shared and static NCCL-compatible libraries built from the object library.
add_library(mscclpp_nccl SHARED)
target_link_libraries(mscclpp_nccl PUBLIC mscclpp_obj mscclpp_nccl_obj)
set_target_properties(mscclpp_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
add_library(mscclpp_nccl_static STATIC)
target_link_libraries(mscclpp_nccl_static PUBLIC mscclpp_obj mscclpp_nccl_obj)
set_target_properties(mscclpp_nccl_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
# Audit shim: a small standalone shared library (for use via LD_AUDIT).
add_library(mscclpp_audit_nccl_obj OBJECT)
set_target_properties(mscclpp_audit_nccl_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
target_sources(mscclpp_audit_nccl_obj PRIVATE ${AUDIT_SHIM})
add_library(mscclpp_audit_nccl SHARED)
target_link_libraries(mscclpp_audit_nccl PUBLIC mscclpp_audit_nccl_obj)
set_target_properties(mscclpp_audit_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
# Install headers and the three produced libraries under ${INSTALL_PREFIX}.
install(TARGETS mscclpp_nccl_obj
FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX})
install(TARGETS mscclpp_nccl
LIBRARY DESTINATION ${INSTALL_PREFIX}/lib)
install(TARGETS mscclpp_nccl_static
ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib)
install(TARGETS mscclpp_audit_nccl
LIBRARY DESTINATION ${INSTALL_PREFIX}/lib)

View File

@@ -1,220 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mscclpp/nccl.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/env.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "allgather.hpp"
#include "datatype_conversion.hpp"
#include "debug.h"
// Constructor: channel caching is on by default; the mscclpp environment flag
// `disableChannelCache` turns it off, forcing channels to be rebuilt per call.
AllgatherAlgo6::AllgatherAlgo6() : disableChannelCache_(false) {
  if (mscclpp::env()->disableChannelCache) {
    disableChannelCache_ = true;
  }
}
// One-time setup: establish peer connections and pre-create the memory
// semaphores (nChannelsPerConnection_ per connection) reused by every context.
void AllgatherAlgo6::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
  this->conns_ = setupConnections(comm);
  this->memorySemaphores_ = std::move(setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_));
}
// Launch the allgather6 kernel. Chooses the grid size from the message size
// (empirical tuning table) and picks the in-place vs. out-of-place template
// variant by checking whether `input` aliases this rank's slot in `output`.
// Returns ncclInternalError if the kernel launch failed.
ncclResult_t AllgatherAlgo6::allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                                 void* output, size_t count, mscclpp::DataType dtype,
                                                 cudaStream_t stream,
                                                 std::unordered_map<std::string, std::shared_ptr<void>>&) {
  int nBlocks = 28;
  const size_t bytes = count * getDataTypeSize(dtype);
  // The kernel moves data in int-sized units; assumes `bytes` is a multiple of sizeof(int).
  const size_t nElem = bytes / sizeof(int);
  int rank = ctx->rank;
  // Empirical block-count tuning: fewer blocks for tiny messages, more for large ones.
  if (bytes <= 32 * (1 << 20)) {
    if (nElem <= 4096) {
      nBlocks = 7;
    } else if (nElem <= 32768) {
      nBlocks = 14;
    } else if (nElem >= 2097152) {
      nBlocks = 35;
    }
  } else {
    nBlocks = 35;
  }
  // Offset of `output` inside its registered base allocation (computed in initAllgatherContext).
  size_t channelOutOffset = *static_cast<size_t*>(ctx->extras["channel_out_offset"].get());
  // In-place when `input` is exactly this rank's chunk within `output`.
  if ((char*)input == (char*)output + rank * bytes) {
    allgather6<false><<<nBlocks, 1024, 0, stream>>>((void*)input, ctx->memoryChannelDeviceHandles.get(),
                                                    channelOutOffset, ctx->rank, ctx->workSize, ctx->nRanksPerNode,
                                                    nElem);
  } else {
    allgather6<true><<<nBlocks, 1024, 0, stream>>>((void*)input, ctx->memoryChannelDeviceHandles.get(),
                                                   channelOutOffset, ctx->rank, ctx->workSize, ctx->nRanksPerNode,
                                                   nElem);
  }
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    WARN("AllgatherAlgo6 failed with error %d", err);
    return ncclInternalError;
  }
  return ncclSuccess;
}
// Build the per-buffer algorithm context: registers the output buffer's base
// allocation with peers and wires memory channels over the pre-created
// semaphores. The context is cached by key unless channel caching is disabled.
std::shared_ptr<mscclpp::AlgorithmCtx> AllgatherAlgo6::initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                                            const void*, void* output, size_t count,
                                                                            mscclpp::DataType dtype) {
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  // setup semaphores (shared across contexts; created once in initialize())
  ctx->memorySemaphores = this->memorySemaphores_;
  size_t bytes = count * getDataTypeSize(dtype);
  size_t recvBytes;
  CUdeviceptr recvBasePtr;
  // Register the whole base allocation containing `output` so the cached
  // channels stay valid for any sub-buffer of the same allocation.
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  size_t channelOutOffset = (char*)output - (char*)recvBasePtr;
  if (disableChannelCache_) {
    // No caching: register exactly this buffer, so no offset is needed.
    channelOutOffset = 0;
    recvBytes = bytes;
    recvBasePtr = (CUdeviceptr)output;
  }
  ctx->extras.insert({"channel_out_offset", std::make_shared<size_t>(channelOutOffset)});
  // register the memory for the broadcast operation
  mscclpp::RegisteredMemory localMemory =
      comm->registerMemory((void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc);
  std::vector<mscclpp::RegisteredMemory> remoteMemories = setupRemoteMemories(comm, ctx->rank, localMemory);
  ctx->memoryChannels = std::move(
      setupMemoryChannels(this->conns_, ctx->memorySemaphores, remoteMemories, localMemory, nChannelsPerConnection_));
  ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
  // keep registered memories reference (channels only hold non-owning views)
  ctx->registeredMemories = std::move(remoteMemories);
  ctx->registeredMemories.push_back(localMemory);
  return ctx;
}
// Cache key for context reuse: keyed on the output buffer's base allocation
// (pointer + size), so any sub-buffer of the same allocation reuses channels.
// With caching disabled, a monotonically increasing tag makes every key unique.
mscclpp::AlgorithmCtxKey AllgatherAlgo6::generateAllgatherContextKey(const void*, void* output, size_t,
                                                                     mscclpp::DataType) {
  static int tag = 0;  // NOTE(review): not thread-safe; assumes single-threaded key generation — confirm
  if (disableChannelCache_) {
    // always return a new key if channel cache is disabled
    return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, tag++};
  }
  size_t recvBytes;
  CUdeviceptr recvBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  return mscclpp::AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, 0};
}
// Package this builder as a mscclpp::Algorithm named "default_allgather6".
// The lambdas share ownership of `self`, so the algorithm object keeps the
// builder (connections, semaphores) alive for its whole lifetime.
mscclpp::Algorithm AllgatherAlgo6::build() {
  auto self = std::make_shared<AllgatherAlgo6>();
  mscclpp::Algorithm allgatherAlgo(
      "default_allgather6", "allgather",
      // init hook
      [self](std::shared_ptr<mscclpp::Communicator> comm,
             std::unordered_map<std::string, std::shared_ptr<void>>& extras) { self->initialize(comm, extras); },
      // kernel-launch hook
      [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t count,
             mscclpp::DataType dtype, cudaStream_t stream,
             std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
        return self->allgatherKernelFunc(ctx, input, output, count, dtype, stream, extras);
      },
      // context-creation hook
      [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
             mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, count, dtype); },
      // context-cache-key hook
      [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
        return self->generateAllgatherContextKey(input, output, count, dtype);
      });
  return allgatherAlgo;
}
// One-time setup: establish peer connections and adopt the shared scratch
// buffer handed in through `extras` ("scratch": shared_ptr<char>,
// "scratch_size": shared_ptr<size_t>). Throws std::out_of_range if either
// entry is missing.
void AllgatherAlgo8::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  conns_ = setupConnections(comm);
  auto scratchEntry = extras.at("scratch");
  auto scratchSizeEntry = extras.at("scratch_size");
  scratchBuffer_ = std::static_pointer_cast<char>(scratchEntry);
  scratchBufferSize_ = *std::static_pointer_cast<size_t>(scratchSizeEntry);
}
// Launch the allgather8 kernel (scratch-buffer pipeline) with a fixed 56x1024
// launch configuration. Picks in-place vs. out-of-place by checking whether
// `input` aliases this rank's slot in `output`. Returns ncclInternalError on
// launch failure.
ncclResult_t AllgatherAlgo8::allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                                 void* output, size_t count, mscclpp::DataType dtype,
                                                 cudaStream_t stream,
                                                 std::unordered_map<std::string, std::shared_ptr<void>>&) {
  int rank = ctx->rank;
  const size_t bytes = count * getDataTypeSize(dtype);
  // Kernel moves data in int-sized units; assumes `bytes` is a multiple of sizeof(int).
  const size_t nElem = bytes / sizeof(int);
  if ((char*)input == (char*)output + rank * bytes) {
    allgather8<false><<<56, 1024, 0, stream>>>((void*)input, this->scratchBuffer_.get(), (void*)output,
                                               ctx->memoryChannelDeviceHandles.get(), rank, ctx->nRanksPerNode,
                                               ctx->workSize, nElem);
  } else {
    allgather8<true><<<56, 1024, 0, stream>>>((void*)input, this->scratchBuffer_.get(), (void*)output,
                                              ctx->memoryChannelDeviceHandles.get(), rank, ctx->nRanksPerNode,
                                              ctx->workSize, nElem);
  }
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    WARN("AllgatherAlgo8 failed with error %d", err);
    return ncclInternalError;
  }
  return ncclSuccess;
}
// Build the algorithm context for the scratch-based allgather: remote peers
// see our scratch buffer (writes land there), while the local input buffer is
// registered for the channels' source side. 56 channels per connection,
// matching the kernel's 56-block launch.
std::shared_ptr<mscclpp::AlgorithmCtx> AllgatherAlgo8::initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                                            const void* input, void*, size_t count,
                                                                            mscclpp::DataType dtype) {
  constexpr int nChannelsPerConnection = 56;
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  // setup semaphores (created per context, unlike AllgatherAlgo6 which shares them)
  ctx->memorySemaphores = std::move(setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection));
  size_t bytes = count * getDataTypeSize(dtype);
  // register the memory for the broadcast operation
  mscclpp::RegisteredMemory localMemory = comm->registerMemory((void*)input, bytes, mscclpp::Transport::CudaIpc);
  mscclpp::RegisteredMemory scratchMemory =
      comm->registerMemory(this->scratchBuffer_.get(), scratchBufferSize_, mscclpp::Transport::CudaIpc);
  std::vector<mscclpp::RegisteredMemory> remoteMemories = setupRemoteMemories(comm, ctx->rank, scratchMemory);
  // setup channels
  ctx->memoryChannels = std::move(
      setupMemoryChannels(this->conns_, ctx->memorySemaphores, remoteMemories, localMemory, nChannelsPerConnection));
  ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
  // keep registered memories reference (channels only hold non-owning views)
  ctx->registeredMemories = std::move(remoteMemories);
  ctx->registeredMemories.push_back(localMemory);
  ctx->registeredMemories.push_back(scratchMemory);
  return ctx;
}
// Context key for the scratch-based (non-zero-copy) allgather: user buffers
// never feed the key, so every invocation maps to one shared cached context.
mscclpp::AlgorithmCtxKey AllgatherAlgo8::generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType) {
  mscclpp::AlgorithmCtxKey sharedKey{nullptr, nullptr, 0, 0, 0};
  return sharedKey;
}
// Package this builder as a mscclpp::Algorithm named "default_allgather8".
// The lambdas share ownership of `self`, so the algorithm object keeps the
// builder (connections, scratch buffer) alive for its whole lifetime.
mscclpp::Algorithm AllgatherAlgo8::build() {
  auto self = std::make_shared<AllgatherAlgo8>();
  mscclpp::Algorithm allgatherAlgo(
      "default_allgather8", "allgather",
      // init hook
      [self](std::shared_ptr<mscclpp::Communicator> comm,
             std::unordered_map<std::string, std::shared_ptr<void>>& extras) { self->initialize(comm, extras); },
      // kernel-launch hook
      [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t count,
             mscclpp::DataType dtype, cudaStream_t stream,
             std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
        return self->allgatherKernelFunc(ctx, input, output, count, dtype, stream, extras);
      },
      // context-creation hook
      [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
             mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, count, dtype); },
      // context-cache-key hook
      [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
        return self->generateAllgatherContextKey(input, output, count, dtype);
      });
  return allgatherAlgo;
}

View File

@@ -1,255 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#ifndef ALLGATHER_HPP_
#define ALLGATHER_HPP_
#include <mscclpp/nccl.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/executor.hpp>
#include <mscclpp/gpu.hpp>
#include <mscclpp/memory_channel.hpp>
#include <mscclpp/memory_channel_device.hpp>
#include "common.hpp"
// Zero-copy allgather kernel: every warp pushes a slice of this rank's chunk
// directly into each peer's output buffer (via memory channels). Layout:
// warps round-robin over peers; each warp moves unitBytesPerWarp per step.
// IsOutOfPlace additionally materializes this rank's own chunk in the local
// output buffer. Entry/exit signal+wait pairs act as barriers with the peers.
template <bool IsOutOfPlace>
__global__ void __launch_bounds__(1024, 1)
    allgather6(void* sendbuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels, size_t channelOutOffset,
               size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) {
  const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  const size_t lid = tid % WARP_SIZE;  // lane within warp
  const size_t wid = tid / WARP_SIZE;  // global warp id
  const size_t nThread = blockDim.x * gridDim.x;
  const size_t nWarp = nThread / WARP_SIZE;
  const size_t nPeer = nRanksPerNode - 1;
  const size_t chanOffset = nPeer * blockIdx.x;  // each block has its own set of nPeer channels
  auto memChans = memoryChannels + chanOffset;
  // Entry barrier: first nPeer threads of each block handshake with the peers.
  if (threadIdx.x < nPeer) {
    memChans[threadIdx.x].relaxedSignal();
    memChans[threadIdx.x].wait();
  }
  __syncthreads();
  const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
  const size_t bytes = bytesPerGPU * nPeer;
  // Larger per-thread unit for big transfers, smaller to keep all threads busy
  // on small ones.
  size_t unitBytesPerThread;
  if (bytes >= nThread * 64) {
    unitBytesPerThread = 64;
  } else {
    unitBytesPerThread = 16;
  }
  const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE;
  const size_t unitBytes = unitBytesPerWarp * nWarp;
  const size_t nLoop = bytes / unitBytes;
  if (nLoop > 0) {
    // First loop unrolling
    const size_t peerIdx = wid % nPeer;
    const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp;
    if constexpr (IsOutOfPlace) {
      // Out-of-place: write the send data into both the local (src_) and the
      // peer's (dst_) output buffer — presumably because the local output does
      // not yet hold this rank's chunk; confirm against channel semantics.
      char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_);
      char* src = reinterpret_cast<char*>(memChans[peerIdx].src_);
      char* buff = reinterpret_cast<char*>(sendbuff);
      const size_t offsetWithinRank = (wid / nPeer) * unitBytesPerWarp;
      mscclpp::copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid,
                               WARP_SIZE);
      mscclpp::copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid,
                               WARP_SIZE);
    } else {
      // In-place: push this slice straight into the peer's buffer.
      memChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE);
    }
  }
  // Remaining full units (loop starts at 1 because iteration 0 was peeled above).
  for (size_t i = 1; i < nLoop; ++i) {
    const size_t gWid = wid + i * nWarp;
    const size_t peerIdx = gWid % nPeer;
    const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp;
    if constexpr (IsOutOfPlace) {
      char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_);
      char* src = reinterpret_cast<char*>(memChans[peerIdx].src_);
      char* buff = reinterpret_cast<char*>(sendbuff);
      const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp;
      mscclpp::copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid,
                               WARP_SIZE);
      mscclpp::copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid,
                               WARP_SIZE);
    } else {
      memChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE);
    }
  }
  // Tail: transfer the final partial unit, clamped to the chunk boundary.
  if (bytes % unitBytes > 0) {
    const size_t gWid = wid + nLoop * nWarp;
    const size_t peerIdx = gWid % nPeer;
    const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp;
    const size_t offset = bytesPerGPU * rank + offsetWithinRank;
    const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU)
                                   ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0)
                                   : unitBytesPerWarp;
    if (remainBytes > 0) {
      if constexpr (IsOutOfPlace) {
        char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_);
        char* src = reinterpret_cast<char*>(memChans[peerIdx].src_);
        char* buff = reinterpret_cast<char*>(sendbuff);
        mscclpp::copy<16, true>(src + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE);
        mscclpp::copy<16, true>(dst + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE);
      } else {
        memChans[peerIdx].put<16, true>(offset + channelOutOffset, remainBytes, lid, WARP_SIZE);
      }
    }
  }
  // Grid-wide barrier so all blocks finish writing before the exit handshake.
  deviceSyncer.sync(gridDim.x);
  // Exit barrier: ensure peers' writes into our buffer have completed too.
  if (threadIdx.x < nPeer) {
    memChans[threadIdx.x].signal();
    memChans[threadIdx.x].wait();
  }
}
// Scratch-buffer allgather kernel: data flows through a shared scratch buffer
// in fixed-size chunks. Per chunk: barrier with peers, broadcast the local
// chunk into every peer's scratch slot (at this rank's offset), barrier again,
// then copy each peer's chunk from the local scratch into the result buffer.
// IsOutOfPlace also copies the local chunk directly into the result.
// Assumes (nelems * sizeof(int)) is divisible by 16 (int4 transfers).
template <bool IsOutOfPlace>
__global__ void __launch_bounds__(1024, 1)
    allgather8(void* buff, void* scratch, void* resultBuff,
               mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels, int rank, int nRanksPerNode,
               [[maybe_unused]] int worldSize, size_t nelems) {
  const int nPeer = nRanksPerNode - 1;
  const size_t chanOffset = nPeer * blockIdx.x;  // each block owns nPeer channels
  // assume (nelems * sizeof(T)) is divisible by 16
  const size_t nInt4 = nelems * sizeof(int) / sizeof(int4);
  auto memoryChans = memoryChannels + chanOffset;
  int4* buff4 = reinterpret_cast<int4*>(buff);
  int4* scratch4 = reinterpret_cast<int4*>(scratch);
  int4* resultBuff4 = reinterpret_cast<int4*>(resultBuff);
  const size_t unitNInt4 = blockDim.x * gridDim.x;  // The number of int4 transfers at once
  const size_t nInt4PerChunk = unitNInt4 * 4;       // 4 instructions per thread to make it more efficient
  const size_t nItrs = nInt4 / nInt4PerChunk;
  const size_t restNInt4 = nInt4 % nInt4PerChunk;
  const size_t scratchChunkRankOffset = nInt4PerChunk * rank;  // this rank's slot in every scratch buffer
  // Cache the channel handles in shared memory for faster repeated access.
  __shared__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
  const int lid = threadIdx.x % WARP_SIZE;
  if (lid < nPeer) {
    channels[lid] = memoryChans[lid];
  }
  __syncwarp();
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  // we can use double buffering to hide synchronization overhead
  for (size_t itr = 0; itr < nItrs; itr++) {
    // Barrier: scratch slot from the previous round may be reused only after
    // all peers have drained it.
    if (threadIdx.x < static_cast<uint32_t>(nPeer)) {
      channels[threadIdx.x].signal();
      channels[threadIdx.x].wait();
    }
    __syncthreads();
    // Starts allgather
    for (size_t idx = tid; idx < nInt4PerChunk; idx += blockDim.x * gridDim.x) {
      int4 val = buff4[itr * nInt4PerChunk + idx];
      for (int i = 0; i < nPeer; i++) {
        // Rotate the starting peer by rank to spread link contention.
        const int peerIdx = (i + rank) % nPeer;
        channels[peerIdx].write(idx + scratchChunkRankOffset, val);
      }
      if constexpr (IsOutOfPlace) {
        resultBuff4[nInt4 * rank + idx + itr * nInt4PerChunk] = val;
      }
    }
    // Ensure that all writes of this block have been issued before issuing the signal
    __syncthreads();
    if (threadIdx.x < static_cast<uint32_t>(nPeer)) {
      channels[threadIdx.x].signal();
      channels[threadIdx.x].wait();
    }
    __syncthreads();
    // Gather: copy each peer's chunk from our scratch into the result buffer.
    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
      const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;  // channel index -> rank (skip self)
      const int resultOffset = nInt4 * remoteRank + itr * nInt4PerChunk;
      for (size_t idx = tid; idx < nInt4PerChunk; idx += blockDim.x * gridDim.x) {
        int4 val = scratch4[nInt4PerChunk * remoteRank + idx];
        resultBuff4[resultOffset + idx] = val;
      }
    }
  }
  // Final partial chunk, same broadcast/gather sequence over restNInt4 elements.
  if (restNInt4 > 0) {
    if (threadIdx.x < static_cast<uint32_t>(nPeer)) {
      channels[threadIdx.x].signal();
      channels[threadIdx.x].wait();
    }
    __syncthreads();
    for (size_t idx = tid; idx < restNInt4; idx += blockDim.x * gridDim.x) {
      int4 val = buff4[nItrs * nInt4PerChunk + idx];
      for (int i = 0; i < nPeer; i++) {
        const int peerIdx = (i + rank) % nPeer;
        channels[peerIdx].write(idx + scratchChunkRankOffset, val);
      }
      if constexpr (IsOutOfPlace) {
        resultBuff4[nInt4 * rank + idx + nItrs * nInt4PerChunk] = val;
      }
    }
    // Ensure that all writes of this block have been issued before issuing the signal
    __syncthreads();
    if (threadIdx.x < static_cast<uint32_t>(nPeer)) {
      channels[threadIdx.x].signal();
      channels[threadIdx.x].wait();
    }
    __syncthreads();
    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
      const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
      const int resultOffset = nInt4 * remoteRank + nItrs * nInt4PerChunk;
      for (size_t idx = tid; idx < restNInt4; idx += blockDim.x * gridDim.x) {
        int4 val = scratch4[nInt4PerChunk * remoteRank + idx];
        resultBuff4[resultOffset + idx] = val;
      }
    }
  }
}
// Zero-copy allgather algorithm builder (kernel: allgather6). Peers write
// directly into each other's output buffers; per-buffer contexts are cached
// unless the channel cache is disabled via the environment.
class AllgatherAlgo6 : public mscclpp::AlgorithmBuilder {
 public:
  AllgatherAlgo6();
  mscclpp::Algorithm build() override;

 private:
  bool disableChannelCache_;  // when true, a fresh context is built per call
  std::vector<mscclpp::Connection> conns_;
  // Pre-created semaphores shared by all cached contexts.
  std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores_;
  const int nChannelsPerConnection_ = 35;  // matches the kernel's max grid size
  void initialize(std::shared_ptr<mscclpp::Communicator> comm, std::unordered_map<std::string, std::shared_ptr<void>>&);
  ncclResult_t allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                                   size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                                   std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  std::shared_ptr<mscclpp::AlgorithmCtx> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm, const void*,
                                                              void* output, size_t, mscclpp::DataType);
  mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType);
};
// Scratch-buffer allgather algorithm builder (kernel: allgather8). Data is
// staged through a shared scratch buffer supplied at initialize() time, so a
// single cached context serves all user buffers.
class AllgatherAlgo8 : public mscclpp::AlgorithmBuilder {
 public:
  mscclpp::Algorithm build() override;

 private:
  std::vector<mscclpp::Connection> conns_;
  void initialize(std::shared_ptr<mscclpp::Communicator> comm,
                  std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  ncclResult_t allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                                   size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                                   std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  std::shared_ptr<mscclpp::AlgorithmCtx> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm, const void*,
                                                              void* output, size_t, mscclpp::DataType);
  mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType);
  size_t scratchBufferSize_;          // capacity of the shared scratch buffer
  std::shared_ptr<char> scratchBuffer_;  // provided via extras["scratch"] in initialize()
};
#endif // ALLGATHER_HPP_

View File

@@ -1,708 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mscclpp/nccl.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/env.hpp>
#include <mscclpp/gpu.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "allreduce.hpp"
#include "datatype_conversion.hpp"
#include "debug.h"
using AllreduceFunc =
std::function<cudaError_t(const void*, void*, void*, void*, void*, mscclpp::DeviceHandle<mscclpp::SwitchChannel>*,
mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t, int, int, int,
size_t, cudaStream_t, uint32_t*, uint32_t*, uint32_t*, uint32_t)>;
namespace {
// Adapter for the all-pairs allreduce kernels. Dispatches by message size:
// tiny -> allreduceAllPairs with a minimal launch, small -> allreduceAllPairs
// with a bigger grid, up to 1 MiB -> allreduce7 with per-arch/type tuned
// launch parameters. NOTE(review): messages larger than 1 MiB launch nothing
// here and just return the last CUDA error — presumably the caller routes
// those to another adapter; confirm.
template <Op OpType, typename T>
struct AllpairAdapter {
  static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t channelInOffset, size_t,
                          size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t nelems,
                          cudaStream_t stream, uint32_t* deviceFlag7, uint32_t* deviceFlag28, uint32_t* deviceFlag56,
                          uint32_t numScratchBuff) {
    using ChannelType = mscclpp::DeviceHandle<mscclpp::MemoryChannel>;
    if (sizeof(T) * nelems < worldSize * sizeof(int)) {
      // Tiny message: one block per peer, a single warp each.
      int nBlocks = worldSize - 1;
      int nThreadsPerBlock = 32;
      allreduceAllPairs<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
          (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
          nRanksPerNode, worldSize, nelems, deviceFlag7, numScratchBuff);
    } else if (sizeof(T) * nelems <= (1 << 14)) {
      // Up to 16 KiB: wider grid, same all-pairs kernel.
      int nBlocks = (worldSize - 1) * 4;
      int nThreadsPerBlock = 512;
      allreduceAllPairs<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
          (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
          nRanksPerNode, worldSize, nelems, deviceFlag28, numScratchBuff);
    } else if (sizeof(T) * nelems <= (1 << 20)) {
      // 16 KiB - 1 MiB: allreduce7 with empirically tuned launch parameters.
      int nBlocks = (nRanksPerNode - 1) * 4;
      int nThreadsPerBlock = 1024;
      uint32_t* deviceFlag = deviceFlag28;  // flag array must match the grid size in use
      if (nelems >= 8192) {
        nBlocks = (worldSize - 1) * 8;
        nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
        deviceFlag = deviceFlag56;
#if defined(__HIP_PLATFORM_AMD__)
        // AMD-specific thread-count tuning for mid-size messages.
        size_t sizeBytes = sizeof(T) * nelems;
        if constexpr (std::is_same_v<T, __half>) {
          // Half-specific tuning for 32KB-256KB range
          if (sizeBytes == (32 << 10)) {
            nThreadsPerBlock = 64;
          } else if (sizeBytes >= (64 << 10) && sizeBytes <= (256 << 10)) {
            nThreadsPerBlock = 128;
          }
        }
#if defined(__FP8_TYPES_EXIST__)
        // FP8-specific tuning for 32KB-256KB range
        if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
          if (sizeBytes == (32 << 10)) {
            nThreadsPerBlock = 64;
          } else if (sizeBytes == (64 << 10)) {
            nThreadsPerBlock = 128;
          } else if (sizeBytes >= (128 << 10) && sizeBytes <= (256 << 10)) {
            nThreadsPerBlock = 256;
          }
        }
#endif
#endif
      }
#if defined(ENABLE_NPKIT)
      // NPKit build: reserve shared memory for event collection and pass the
      // profiling contexts into the kernel.
      size_t NpkitSharedMemSize = NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent);
      allreduce7<OpType><<<nBlocks, nThreadsPerBlock, NpkitSharedMemSize, stream>>>(
          (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
          nRanksPerNode, worldSize, nelems, deviceFlag, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
          NpKit::GetCpuTimestamp());
#else
      allreduce7<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
          (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
          nRanksPerNode, worldSize, nelems, deviceFlag, numScratchBuff);
#endif
    }
    return cudaGetLastError();
  }
};
// Adapter for the zero-copy NVLS (switch-channel) allreduce kernel allreduce9.
// FP8 is rejected as unsupported for NVLS. NOTE(review): the guard below tests
// __CUDA_ARCH__ in what looks like host-side launch code, where that macro is
// normally undefined — verify the FP8 rejection actually takes effect.
template <Op OpType, typename T>
struct NvlsAdapter {
  static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsOutChannels, size_t channelInOffset,
                          size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t nelems,
                          cudaStream_t stream, uint32_t*, uint32_t*, uint32_t*, uint32_t) {
#if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
    if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
      return cudaErrorNotSupported;
    } else
#endif
    {
      using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
      // One block per local rank; buffers are accessed in place via switch channels.
      int nBlocks = nRanksPerNode;
      int nThreadsPerBlock = 1024;
      allreduce9<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
                                                              nvlsOutChannels, channelInOffset, channelOutOffset,
                                                              nelems * sizeof(T), rank, nRanksPerNode);
      return cudaGetLastError();
    }
  }
};
// Adapter for NVLS allreduce variants that stage data through a scratch buffer
// (allreduce10 for < 16 MiB, allreduce11 otherwise). FP8 is rejected as
// unsupported for NVLS. NOTE(review): same host-side __CUDA_ARCH__ guard
// concern as NvlsAdapter — verify the FP8 rejection takes effect.
template <Op OpType, typename T>
struct NvlsWithCopyAdapter {
  static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t scratchBufferSize,
                          int rank, int nRanksPerNode, int, size_t nelems, cudaStream_t stream, uint32_t*, uint32_t*,
                          uint32_t*, uint32_t) {
#if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
    if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
      return cudaErrorNotSupported;
    } else
#endif
    {
      using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
      if (sizeof(T) * nelems < (1 << 24)) {
        // Under 16 MiB: allreduce10 with 4 blocks per local rank.
        int nBlocks = nRanksPerNode * 4;
        int nThreadsPerBlock = 1024;
        allreduce10<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
                                                                 nvlsChannels, nelems * sizeof(T), scratchBufferSize,
                                                                 rank, nRanksPerNode);
      } else {
        // 16 MiB and above: allreduce11 with 5 blocks per local rank.
        int nBlocks = nRanksPerNode * 5;
        int nThreadsPerBlock = 1024;
        allreduce11<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
                                                                 nvlsChannels, nelems * sizeof(T), scratchBufferSize,
                                                                 rank, nRanksPerNode);
      }
      return cudaGetLastError();
    }
  }
};
// Adapter for the allreduce8 kernel (memory-channel in/out pair) with a fixed
// launch configuration of (nRanksPerNode - 1) * 5 blocks x 512 threads.
template <Op OpType, typename T>
struct Allreduce8Adapter {
  static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels,
                          void* memoryOutChannels, mscclpp::DeviceHandle<mscclpp::SwitchChannel>*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t channelOutOffset, size_t,
                          int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream, uint32_t*,
                          uint32_t*, uint32_t*, uint32_t) {
    using ChannelType = mscclpp::DeviceHandle<mscclpp::MemoryChannel>;
    int nBlocks = (nRanksPerNode - 1) * 5;
    int nThreadsPerBlock = 512;
    // Input channel offset is fixed at 0; only the output offset is honored.
    allreduce8<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
        (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels,
        channelOutOffset, 0, rank, nRanksPerNode, worldSize, nelems);
    return cudaGetLastError();
  }
};
// Adapter for the NVLS packet-based allreduce kernel. Uses a smaller launch
// (4 blocks x 512 threads) for messages up to 8 KiB, otherwise 8 x 1024.
template <Op OpType, typename T>
struct AllreduceNvlsPacketAdapter {
  static cudaError_t call(const void* input, void* scratch, void* output, void*, void*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t scratchBufferSize,
                          int rank, int, int worldSize, size_t nelems, cudaStream_t stream, uint32_t* deviceFlag,
                          uint32_t*, uint32_t*, uint32_t) {
    size_t size = nelems * sizeof(T);
    int nBlocks = 8;
    int nThreadsPerBlock = 1024;
    if (size <= (1 << 13)) {
      // Small message: shrink the launch to cut latency.
      nBlocks = 4;
      nThreadsPerBlock = 512;
    }
    allreduceNvlsPacket<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
        (const T*)input, (T*)scratch, (T*)output, nvlsChannels, nelems, scratchBufferSize, rank, worldSize, deviceFlag);
    return cudaGetLastError();
  }
};
// Maps a runtime data type onto the matching compile-time Adapter instantiation
// for a fixed reduction operator. Returns nullptr for unsupported dtypes.
// Factoring this out removes the previously duplicated dtype table that had to
// be kept in sync between the SUM and MIN branches of dispatch().
template <template <Op, typename> class Adapter, Op OpType>
AllreduceFunc dispatchForOp(mscclpp::DataType dtype) {
  if (dtype == mscclpp::DataType::FLOAT16) {
    return Adapter<OpType, half>::call;
  } else if (dtype == mscclpp::DataType::FLOAT32) {
    return Adapter<OpType, float>::call;
#if defined(__CUDA_BF16_TYPES_EXIST__)
  } else if (dtype == mscclpp::DataType::BFLOAT16) {
    return Adapter<OpType, __bfloat16>::call;
#endif
#if defined(__FP8_TYPES_EXIST__)
  } else if (dtype == mscclpp::DataType::FP8_E4M3) {
    return Adapter<OpType, __fp8_e4m3>::call;
  } else if (dtype == mscclpp::DataType::FP8_E5M2) {
    return Adapter<OpType, __fp8_e5m2>::call;
#endif
  } else if (dtype == mscclpp::DataType::INT32 || dtype == mscclpp::DataType::UINT32) {
    return Adapter<OpType, int>::call;
  }
  return nullptr;
}

// Selects the kernel-launch function for the given (op, dtype) pair.
// Only SUM and MIN are supported; anything else yields nullptr.
template <template <Op, typename> class Adapter>
AllreduceFunc dispatch(ncclRedOp_t op, mscclpp::DataType dtype) {
  Op reduceOp = getReduceOp(op);
  if (reduceOp == SUM) {
    return dispatchForOp<Adapter, SUM>(dtype);
  }
  if (reduceOp == MIN) {
    return dispatchForOp<Adapter, MIN>(dtype);
  }
  return nullptr;
}
} // namespace
// Translates an NCCL reduction operator into the internal Op enum.
// Only ncclSum and ncclMin are supported; anything else throws.
enum Op getReduceOp(ncclRedOp_t op) {
  if (op == ncclSum) {
    return SUM;
  }
  if (op == ncclMin) {
    return MIN;
  }
  WARN("op is invalid, op: %d", op);
  throw mscclpp::Error("Invalid operation", mscclpp::ErrorCode::InternalError);
}
// One-time setup: adopts the caller-provided scratch buffer and allocates the
// device-side flag arrays used by kernels launched with 7/28/56 blocks.
// `extras` must contain "scratch" (shared_ptr<char>) and "scratch_size" (size_t).
void AllreducePacket::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                 std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  this->scratchBufferSize_ = *(size_t*)(extras.at("scratch_size").get());
  scratchBuffer_ = std::static_pointer_cast<char>(extras.at("scratch"));
  // Largest flag array; the smaller ones are prefixes of the same init pattern.
  constexpr int kMaxFlags = 56;
  deviceFlag7_ = mscclpp::detail::gpuCallocShared<uint32_t>(7);
  deviceFlag28_ = mscclpp::detail::gpuCallocShared<uint32_t>(28);
  deviceFlag56_ = mscclpp::detail::gpuCallocShared<uint32_t>(kMaxFlags);
  // Fill-construct (all flags start at 1) instead of a manual loop.
  std::vector<uint32_t> initFlag(kMaxFlags, 1);
  mscclpp::gpuMemcpy<uint32_t>(deviceFlag7_.get(), initFlag.data(), 7, cudaMemcpyHostToDevice);
  mscclpp::gpuMemcpy<uint32_t>(deviceFlag28_.get(), initFlag.data(), 28, cudaMemcpyHostToDevice);
  mscclpp::gpuMemcpy<uint32_t>(deviceFlag56_.get(), initFlag.data(), kMaxFlags, cudaMemcpyHostToDevice);
  this->conns_ = setupConnections(comm);
}
// Runs the packet-based allreduce: resolves the op from `extras`, computes the
// input offset within its base allocation, and launches via the AllpairAdapter.
ncclResult_t AllreducePacket::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                                  void* output, size_t count, mscclpp::DataType dtype,
                                                  cudaStream_t stream,
                                                  std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  ncclRedOp_t op = *static_cast<ncclRedOp_t*>(extras.at("op").get());
  // Channels are registered on the base pointer; pass the offset of `input`.
  CUdeviceptr inputBase;
  size_t inputSpan;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&inputBase, &inputSpan, (CUdeviceptr)input));
  const size_t inOffset = (char*)input - (char*)inputBase;
  AllreduceFunc launcher = dispatch<AllpairAdapter>(op, dtype);
  if (launcher == nullptr) {
    WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
    return ncclInvalidArgument;
  }
  cudaError_t launchStatus = launcher(input, this->scratchBuffer_.get(), output, ctx->memoryChannelDeviceHandles.get(),
                                      nullptr, nullptr, nullptr, inOffset, 0, this->scratchBufferSize_, ctx->rank,
                                      ctx->nRanksPerNode, ctx->workSize, count, stream, deviceFlag7_.get(),
                                      deviceFlag28_.get(), deviceFlag56_.get(), this->nSegmentsForScratchBuffer_);
  if (launchStatus != cudaSuccess) {
    WARN("AllreducePacket failed with error: %s", cudaGetErrorString(launchStatus));
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Builds the per-allocation context (semaphores, registered memories, channels)
// for the packet allreduce. Semaphores and the peers' scratch registrations are
// created once and reused from the cached ctx_ on subsequent calls; only the
// channels bound to the current input's base allocation are rebuilt.
// NOTE: registeredMemories keeps the rank-local memory as its LAST element —
// the pop_back/push_back pairing below depends on that invariant.
std::shared_ptr<mscclpp::AlgorithmCtx> AllreducePacket::initAllreduceContext(
    std::shared_ptr<mscclpp::Communicator> comm, const void* input, void*, size_t, mscclpp::DataType) {
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  const int nChannelsPerConnection = 56;
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  if (this->ctx_ == nullptr) {
    // First call: create everything and exchange scratch registrations with peers.
    // setup semaphores
    ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection);
    // setup registered memories
    mscclpp::RegisteredMemory scratchMemory =
        comm->registerMemory(this->scratchBuffer_.get(), this->scratchBufferSize_, mscclpp::Transport::CudaIpc);
    std::vector<mscclpp::RegisteredMemory> remoteMemories = setupRemoteMemories(comm, ctx->rank, scratchMemory);
    ctx->registeredMemories = std::move(remoteMemories);
    ctx->registeredMemories.push_back(scratchMemory);
  } else {
    // Subsequent calls: reuse semaphores and remote registrations from the cached context.
    ctx->memorySemaphores = ctx_->memorySemaphores;
    ctx->registeredMemories = ctx_->registeredMemories;
    ctx->registeredMemories.pop_back();  // remove the local memory from previous context
  }
  // Register the full base allocation that contains `input`.
  size_t sendBytes;
  CUdeviceptr sendBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
  mscclpp::RegisteredMemory localMemory =
      comm->registerMemory((void*)sendBasePtr, sendBytes, mscclpp::Transport::CudaIpc);
  // setup channels
  ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory,
                                            nChannelsPerConnection);
  ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
  // Re-append the (new) local memory so the invariant holds for the next reuse.
  ctx->registeredMemories.emplace_back(localMemory);
  this->ctx_ = ctx;
  return ctx;
}
// Keys the context cache on the input's base allocation so that channels are
// reused for every buffer carved out of the same allocation.
mscclpp::AlgorithmCtxKey AllreducePacket::generateAllreduceContextKey(const void* input, void*, size_t,
                                                                      mscclpp::DataType) {
  CUdeviceptr baseAddr;
  size_t baseSize;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&baseAddr, &baseSize, (CUdeviceptr)input));
  return mscclpp::AlgorithmCtxKey{(void*)baseAddr, nullptr, baseSize, 0, 0};
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm AllreducePacket::build() {
  auto self = std::make_shared<AllreducePacket>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->allreduceKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initAllreduceContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateAllreduceContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_allreduce_packet", "allreduce", initFn, kernelFn, ctxFn, keyFn);
}
// One-time setup for the NVLS allreduce: establishes peer connections and the
// semaphore-backed base channels used for inter-rank synchronization.
void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                               std::unordered_map<std::string, std::shared_ptr<void>>&) {
  nSwitchChannels_ = 8;
  this->conns_ = setupConnections(comm);
  auto semaphores = setupMemorySemaphores(comm, this->conns_, nSwitchChannels_);
  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, semaphores, nSwitchChannels_);
  this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_);
}
// Runs the NVLS allreduce. Only SUM is dispatched here; the switch-channel set
// for the output follows the input's set when the operation is out-of-place.
ncclResult_t AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                                void* output, size_t count, mscclpp::DataType dtype,
                                                cudaStream_t stream,
                                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
  AllreduceFunc launcher = dispatch<NvlsAdapter>(ncclSum, dtype);
  if (launcher == nullptr) {
    WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
    return ncclInvalidArgument;
  }
  // Offsets of the user buffers within their base allocations.
  CUdeviceptr inBase, outBase;
  size_t inSpan, outSpan;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&inBase, &inSpan, (CUdeviceptr)input));
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&outBase, &outSpan, (CUdeviceptr)output));
  const size_t inOffset = (char*)input - (char*)inBase;
  const size_t outOffset = (char*)output - (char*)outBase;
  // In-place reuses the input channels; out-of-place channels are appended after them.
  mscclpp::DeviceHandle<mscclpp::SwitchChannel>* inChannels = ctx->switchChannelDeviceHandles.get();
  mscclpp::DeviceHandle<mscclpp::SwitchChannel>* outChannels =
      (input != output) ? inChannels + nSwitchChannels_ : inChannels;
  cudaError_t launchStatus = launcher(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                      inChannels, outChannels, inOffset, outOffset, 0, ctx->rank, ctx->nRanksPerNode,
                                      ctx->workSize, count, stream, nullptr, nullptr, nullptr, 0);
  if (launchStatus != cudaSuccess) {
    WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(launchStatus));
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Keys the context cache on the base allocations of both user buffers, since the
// NVLS channels are bound directly to those allocations.
mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t,
                                                                    mscclpp::DataType) {
  CUdeviceptr inBase, outBase;
  size_t inSpan, outSpan;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&inBase, &inSpan, (CUdeviceptr)input));
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&outBase, &outSpan, (CUdeviceptr)output));
  return mscclpp::AlgorithmCtxKey{(void*)inBase, (void*)outBase, inSpan, outSpan, 0};
}
// Builds a context whose NVLS switch channels are bound to the base allocations
// of the user's input (and, when out-of-place, output) buffers.
// NOTE(review): setupNvlsConnections appears to be a collective call — every
// rank must take the same branch of the input != output check; verify callers.
std::shared_ptr<mscclpp::AlgorithmCtx> AllreduceNvls::initAllreduceContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                                           const void* input, void* output, size_t,
                                                                           mscclpp::DataType) {
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  // Resolve the full base allocations containing the user pointers.
  size_t sendBytes, recvBytes;
  CUdeviceptr sendBasePtr, recvBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  // setup channels
  ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
  ctx->switchChannels = setupNvlsChannels(ctx->nvlsConnections, (void*)sendBasePtr, sendBytes, nSwitchChannels_);
  if (input != output) {
    // Out-of-place: a second set of connections/channels for the output buffer,
    // appended after the input set (the kernel indexes them by nSwitchChannels_).
    auto nvlsOutConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
    std::vector<mscclpp::SwitchChannel> outChannels =
        setupNvlsChannels(nvlsOutConnections, (void*)recvBasePtr, recvBytes, nSwitchChannels_);
    ctx->nvlsConnections.insert(ctx->nvlsConnections.end(), nvlsOutConnections.begin(), nvlsOutConnections.end());
    ctx->switchChannels.insert(ctx->switchChannels.end(), outChannels.begin(), outChannels.end());
  }
  ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
  return ctx;
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm AllreduceNvls::build() {
  auto self = std::make_shared<AllreduceNvls>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->allreduceKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initAllreduceContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateAllreduceContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_allreduce_nvls", "allreduce", initFn, kernelFn, ctxFn, keyFn);
}
// One-time setup: adopts the caller-provided scratch buffer (the NVLS staging
// area) and builds the semaphore-backed base channels for synchronization.
void AllreduceNvlsWithCopy::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  nSwitchChannels_ = 8;
  constexpr int kBaseChannels = 64;
  scratchBuffer_ = std::static_pointer_cast<char>(extras.at("scratch"));
  scratchBufferSize_ = *(size_t*)(extras.at("scratch_size").get());
  this->conns_ = setupConnections(comm);
  auto semaphores = setupMemorySemaphores(comm, this->conns_, kBaseChannels);
  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, semaphores, kBaseChannels);
  this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_);
}
// Runs the copy-in/copy-out NVLS allreduce: data is staged through the internal
// scratch buffer, so no offsets into the user buffers are needed.
ncclResult_t AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx,
                                                        const void* input, void* output, size_t count,
                                                        mscclpp::DataType dtype, cudaStream_t stream,
                                                        std::unordered_map<std::string, std::shared_ptr<void>>&) {
  AllreduceFunc launcher = dispatch<NvlsWithCopyAdapter>(ncclSum, dtype);
  if (launcher == nullptr) {
    WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
    return ncclInvalidArgument;
  }
  cudaError_t launchStatus =
      launcher(input, this->scratchBuffer_.get(), output, this->memoryChannelsDeviceHandle_.get(), nullptr,
               ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, ctx->rank,
               ctx->nRanksPerNode, ctx->workSize, count, stream, nullptr, nullptr, nullptr, 0);
  if (launchStatus != cudaSuccess) {
    WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(launchStatus));
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Always returns the zero key: channels are bound to the internal scratch
// buffer, so one shared context serves every (input, output, count) combination.
mscclpp::AlgorithmCtxKey AllreduceNvlsWithCopy::generateAllreduceContextKey(const void*, void*, size_t,
                                                                            mscclpp::DataType) {
  return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
// Builds the single shared context: NVLS connections and switch channels bound
// to the internal scratch buffer (the user buffers are never registered).
std::shared_ptr<mscclpp::AlgorithmCtx> AllreduceNvlsWithCopy::initAllreduceContext(
    std::shared_ptr<mscclpp::Communicator> comm, const void*, void*, size_t, mscclpp::DataType) {
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  auto bootstrap = comm->bootstrap();
  ctx->rank = bootstrap->getRank();
  ctx->workSize = bootstrap->getNranks();
  ctx->nRanksPerNode = bootstrap->getNranksPerNode();
  ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
  ctx->switchChannels =
      setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_.get(), scratchBufferSize_, nSwitchChannels_);
  ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
  return ctx;
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm AllreduceNvlsWithCopy::build() {
  auto self = std::make_shared<AllreduceNvlsWithCopy>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->allreduceKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initAllreduceContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateAllreduceContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_allreduce_nvls_with_copy", "allreduce", initFn, kernelFn, ctxFn, keyFn);
}
// One-time setup: adopts the scratch buffer, creates separate semaphore sets for
// the output path and the input->scratch path, and exchanges the scratch
// registration with all peers.
void Allreduce8::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                            std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  this->scratchBuffer_ = std::static_pointer_cast<char>(extras.at("scratch"));
  this->scratchBufferSize_ = *(size_t*)(extras.at("scratch_size").get());
  this->conns_ = setupConnections(comm);
  nChannelsPerConnection_ = 64;
  comm_ = comm;
  this->outputSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_);
  this->inputScratchSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_);
  mscclpp::RegisteredMemory scratchMem =
      comm->registerMemory(scratchBuffer_.get(), scratchBufferSize_, mscclpp::Transport::CudaIpc);
  this->remoteScratchMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMem);
  localScratchMemory_ = std::move(scratchMem);
}
// Runs the allreduce8 algorithm. Channels from `input` to the peers' scratch
// buffers are built lazily per input pointer and cached in memoryChannelsMap_.
// Fix: the original assigned inputChannelHandles inside the cache-hit branch and
// then unconditionally reassigned it afterwards (dead store), and performed up
// to three map lookups; this version does a single find + emplace.
ncclResult_t Allreduce8::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                             void* output, size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                                             std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  const size_t bytes = count * getDataTypeSize(dtype);
  ncclRedOp_t op = *static_cast<ncclRedOp_t*>(extras.at("op").get());
  // Offset of `output` within its base allocation; output channels are
  // registered on the base pointer.
  size_t recvBytes;
  CUdeviceptr recvBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  size_t channelOutOffset = (char*)output - (char*)recvBasePtr;
  // Lazily build and cache the input->scratch channels for this input pointer.
  auto it = this->memoryChannelsMap_.find(input);
  if (it == this->memoryChannelsMap_.end()) {
    mscclpp::RegisteredMemory localMemory =
        comm_->registerMemory(const_cast<void*>(input), bytes, mscclpp::Transport::CudaIpc);
    std::vector<mscclpp::MemoryChannel> channels =
        setupMemoryChannels(this->conns_, this->inputScratchSemaphores_, this->remoteScratchMemories_, localMemory,
                            nChannelsPerConnection_);
    auto handles = setupMemoryChannelDeviceHandles(channels);
    it = this->memoryChannelsMap_.emplace(input, std::make_pair(std::move(channels), std::move(handles))).first;
  }
  std::shared_ptr<mscclpp::MemoryChannel::DeviceHandle> inputChannelHandles = it->second.second;
  AllreduceFunc allreduce = dispatch<Allreduce8Adapter>(op, dtype);
  if (!allreduce) {
    WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
    return ncclInvalidArgument;
  }
  cudaError_t error =
      allreduce(input, this->scratchBuffer_.get(), output, inputChannelHandles.get(),
                ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank,
                ctx->nRanksPerNode, ctx->workSize, count, stream, nullptr, nullptr, nullptr, 0);
  if (error != cudaSuccess) {
    WARN("Allreduce8 failed with error: %s", cudaGetErrorString(error));
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Keys the context cache on the output's base allocation. When channel caching
// is disabled via the environment, a monotonically increasing tag forces a fresh
// context on every call.
mscclpp::AlgorithmCtxKey Allreduce8::generateAllreduceContextKey(const void*, void* output, size_t, mscclpp::DataType) {
  static int tag = 0;
  size_t outSpan;
  CUdeviceptr outBase;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&outBase, &outSpan, (CUdeviceptr)output));
  const int keyTag = mscclpp::env()->disableChannelCache ? tag++ : 0;
  return mscclpp::AlgorithmCtxKey{nullptr, (void*)outBase, 0, outSpan, keyTag};
}
// Builds the per-output-allocation context: registers the output's base
// allocation, exchanges it with peers, and creates the output-side memory
// channels. Semaphores are shared across contexts via outputSemaphores_
// (created once in initialize()).
std::shared_ptr<mscclpp::AlgorithmCtx> Allreduce8::initAllreduceContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                                        const void*, void* output, size_t,
                                                                        mscclpp::DataType) {
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  // setup semaphores
  ctx->memorySemaphores = this->outputSemaphores_;
  // setup memories and channels
  size_t recvBytes;
  CUdeviceptr recvBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  mscclpp::RegisteredMemory localMemory =
      comm->registerMemory((void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc);
  ctx->registeredMemories = setupRemoteMemories(comm, ctx->rank, localMemory);
  ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory,
                                            nChannelsPerConnection_);
  ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
  return ctx;
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm Allreduce8::build() {
  auto self = std::make_shared<Allreduce8>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->allreduceKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initAllreduceContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateAllreduceContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_allreduce_allreduce8", "allreduce", initFn, kernelFn, ctxFn, keyFn);
}
// One-time setup: adopts the caller-provided scratch buffer and allocates the
// device-side flag array (all flags start at 1).
// `extras` must contain "scratch" (shared_ptr<char>) and "scratch_size" (size_t).
void AllreduceNvlsPacket::initialize(std::shared_ptr<mscclpp::Communicator>,
                                     std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  this->scratchBuffer_ = std::static_pointer_cast<char>(extras.at("scratch"));
  this->scratchBufferSize_ = *(size_t*)(extras.at("scratch_size").get());
  constexpr int kNumFlags = 16;
  deviceFlag_ = mscclpp::detail::gpuCallocShared<uint32_t>(kNumFlags);
  // Fill-construct (all flags start at 1) instead of a manual loop.
  std::vector<uint32_t> initFlag(kNumFlags, 1);
  mscclpp::gpuMemcpy<uint32_t>(deviceFlag_.get(), initFlag.data(), kNumFlags, cudaMemcpyHostToDevice);
}
// Always returns the zero key: channels are bound to the internal scratch
// buffer, so one shared context serves every (input, output, count) combination.
mscclpp::AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t,
                                                                          mscclpp::DataType) {
  return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
// Builds the single shared context: one NVLS switch channel over the internal
// scratch buffer is sufficient for the packet-based algorithm.
std::shared_ptr<mscclpp::AlgorithmCtx> AllreduceNvlsPacket::initAllreduceContext(
    std::shared_ptr<mscclpp::Communicator> comm, const void*, void*, size_t, mscclpp::DataType) {
  constexpr int kSwitchChannels = 1;
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  auto bootstrap = comm->bootstrap();
  ctx->rank = bootstrap->getRank();
  ctx->workSize = bootstrap->getNranks();
  ctx->nRanksPerNode = bootstrap->getNranksPerNode();
  ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, kSwitchChannels);
  ctx->switchChannels =
      setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_.get(), this->scratchBufferSize_, kSwitchChannels);
  ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
  return ctx;
}
// Runs the NVLS packet allreduce. The reduction op arrives as a raw int in
// `extra` and is converted back to ncclRedOp_t for dispatch.
ncclResult_t AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx,
                                                      const void* input, void* output, size_t count,
                                                      mscclpp::DataType dtype, cudaStream_t stream,
                                                      std::unordered_map<std::string, std::shared_ptr<void>>& extra) {
  int op = *static_cast<int*>(extra.at("op").get());
  AllreduceFunc launcher = dispatch<AllreduceNvlsPacketAdapter>(static_cast<ncclRedOp_t>(op), dtype);
  if (launcher == nullptr) {
    WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
    return ncclInvalidArgument;
  }
  cudaError_t launchStatus =
      launcher(input, this->scratchBuffer_.get(), output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(),
               nullptr, 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, count, stream,
               this->deviceFlag_.get(), nullptr, nullptr, 0);
  if (launchStatus != cudaSuccess) {
    WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(launchStatus));
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm AllreduceNvlsPacket::build() {
  auto self = std::make_shared<AllreduceNvlsPacket>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->allreduceKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initAllreduceContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateAllreduceContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_allreduce_nvls_packet", "allreduce", initFn, kernelFn, ctxFn, keyFn);
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,97 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mscclpp/nccl.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "broadcast.hpp"
#include "datatype_conversion.hpp"
// One-time setup: establishes peer connections and adopts the caller-provided
// scratch buffer. `extras` must contain "scratch" and "scratch_size".
void BroadcastAlgo6::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  this->conns_ = setupConnections(comm);
  this->scratchMemSize_ = *(size_t*)(extras.at("scratch_size").get());
  this->scratchBuffer_ = std::static_pointer_cast<char>(extras.at("scratch"));
}
// Launches the broadcast kernel, selecting the out-of-place template variant
// when input and output differ. The kernel works on int-sized words, so the
// element count is converted to a word count.
ncclResult_t BroadcastAlgo6::broadcastKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                                 void* output, size_t count, mscclpp::DataType dtype,
                                                 cudaStream_t stream,
                                                 std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  const int root = *(int*)extras.at("root").get();
  const size_t nWords = count * getDataTypeSize(dtype) / sizeof(int);
  const bool outOfPlace = (input != output);
  cudaError_t launchStatus =
      outOfPlace ? broadcast<true>((int*)input, (int*)this->scratchBuffer_.get(), (int*)output,
                                   ctx->memoryChannelDeviceHandles.get(), 0, ctx->rank, ctx->nRanksPerNode, root,
                                   ctx->workSize, nWords, stream)
                 : broadcast<false>((int*)input, (int*)this->scratchBuffer_.get(), (int*)output,
                                    ctx->memoryChannelDeviceHandles.get(), 0, ctx->rank, ctx->nRanksPerNode, root,
                                    ctx->workSize, nWords, stream);
  return (launchStatus == cudaSuccess) ? ncclSuccess : ncclInternalError;
}
// Builds the broadcast context: local channels are registered on the output's
// base allocation, while the remote side of every channel is the peers' scratch
// buffer (data is staged there by the root and copied out by receivers).
std::shared_ptr<mscclpp::AlgorithmCtx> BroadcastAlgo6::initBroadcastContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                                            const void*, void* output, size_t,
                                                                            mscclpp::DataType) {
  constexpr int nChannelsPerConnection = 8;
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  // setup semaphores
  ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection);
  // Resolve the full base allocation containing `output`.
  size_t recvBytes;
  CUdeviceptr recvBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  // register the memory for the broadcast operation
  mscclpp::RegisteredMemory localMemory =
      comm->registerMemory((void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc);
  mscclpp::RegisteredMemory localScratchMemory =
      comm->registerMemory(this->scratchBuffer_.get(), scratchMemSize_, mscclpp::Transport::CudaIpc);
  std::vector<mscclpp::RegisteredMemory> remoteMemories = setupRemoteMemories(comm, ctx->rank, localScratchMemory);
  ctx->memoryChannels =
      setupMemoryChannels(this->conns_, ctx->memorySemaphores, remoteMemories, localMemory, nChannelsPerConnection);
  ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
  // keep registered memories reference
  ctx->registeredMemories = std::move(remoteMemories);
  ctx->registeredMemories.push_back(localMemory);
  ctx->registeredMemories.push_back(localScratchMemory);
  return ctx;
}
// Always returns the zero key: the same context (channels bound to the scratch
// buffer) is reused for every broadcast call.
mscclpp::AlgorithmCtxKey BroadcastAlgo6::generateBroadcastContextKey(const void*, void*, size_t, mscclpp::DataType) {
  // always use same context
  return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm BroadcastAlgo6::build() {
  auto self = std::make_shared<BroadcastAlgo6>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->broadcastKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initBroadcastContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateBroadcastContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_broadcast6", "broadcast", initFn, kernelFn, ctxFn, keyFn);
}

View File

@@ -1,180 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#ifndef BROADCAST_HPP_
#define BROADCAST_HPP_
#include <mscclpp/nccl.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/executor.hpp>
#include <mscclpp/gpu.hpp>
#include <mscclpp/memory_channel.hpp>
#include <mscclpp/memory_channel_device.hpp>
#include "common.hpp"
template <bool IsOutOfPlace>
__global__ void __launch_bounds__(1024, 1)
broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff,
mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels, [[maybe_unused]] size_t channelOutOffset,
size_t rank, [[maybe_unused]] size_t worldSize, size_t root, size_t nRanksPerNode, size_t nelemsPerGPU) {
const size_t nThread = blockDim.x * gridDim.x;
const size_t nPeer = nRanksPerNode - 1;
const size_t chanOffset = nPeer * blockIdx.x;
__shared__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> memChans[MAX_NRANKS_PER_NODE - 1];
if (threadIdx.x < nPeer) {
memChans[threadIdx.x] = memoryChannels[chanOffset + threadIdx.x];
memChans[threadIdx.x].relaxedSignal();
memChans[threadIdx.x].wait();
}
__syncthreads();
const size_t peerRootIdx = (root == rank) ? nPeer : ((root < rank) ? root : (root - 1));
const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
const size_t bytes = bytesPerGPU;
size_t unitBytesPerThread;
if (bytes * nPeer >= nThread * 64) {
unitBytesPerThread = 64;
} else {
unitBytesPerThread = 16;
}
const size_t unitBytesPerBlock = unitBytesPerThread * blockDim.x;
const size_t unitBytes = unitBytesPerBlock * gridDim.x;
const size_t nLoop = bytes / unitBytes;
const size_t maxScratchSizeToUse = (SCRATCH_SIZE - unitBytes);
const size_t nLoopToSync = (maxScratchSizeToUse / unitBytes) + 1;
size_t scratchSub = 0;
// First loop will always fit the scratch size.
if (nLoop > 0) {
// First loop unrolling
const size_t offset = blockIdx.x * unitBytesPerBlock;
if (rank == root) {
char* send_ = reinterpret_cast<char*>(sendbuff);
for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) {
char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_); // Peer's scratchbuff.
mscclpp::copy<16, false>(dst + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x);
__syncthreads();
if (threadIdx.x == peerIdx) memChans[peerIdx].signal();
}
if constexpr (IsOutOfPlace) {
char* recv_ = reinterpret_cast<char*>(recvbuff);
mscclpp::copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x);
}
} else { // rank != root.
if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait();
__syncthreads();
char* recv_ = reinterpret_cast<char*>(recvbuff);
char* scratch_ = reinterpret_cast<char*>(scratchbuff); // My scratchbuff.
mscclpp::copy<16, false>(recv_ + offset, scratch_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x);
}
}
for (size_t i = 1; i < nLoop; ++i) {
const size_t offset = blockIdx.x * unitBytesPerBlock + i * unitBytes;
if (i % nLoopToSync == 0) { // Sync to reuse scratch buff
scratchSub = -i * unitBytes;
deviceSyncer.sync(gridDim.x);
if (threadIdx.x < nPeer) {
memChans[threadIdx.x].relaxedSignal();
memChans[threadIdx.x].wait();
}
}
if (rank == root) {
char* send_ = reinterpret_cast<char*>(sendbuff);
for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) {
char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_); // Peer's scratchbuff.
mscclpp::copy<16, false>(dst + offset + scratchSub, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x);
__syncthreads();
if (threadIdx.x == peerIdx) memChans[peerIdx].signal();
}
if constexpr (IsOutOfPlace) {
char* recv_ = reinterpret_cast<char*>(recvbuff);
mscclpp::copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x);
}
} else { // rank != root.
if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait();
__syncthreads();
char* recv_ = reinterpret_cast<char*>(recvbuff);
char* scratch_ = reinterpret_cast<char*>(scratchbuff); // My scratchbuff.
mscclpp::copy<16, false>(recv_ + offset, scratch_ + offset + scratchSub, unitBytesPerBlock, threadIdx.x,
blockDim.x);
}
}
// Remainder loop will also fit the scratch buff since we subtract unitBytes from SCRATCH_SIZE.
if (bytes % unitBytes > 0) { // remainder.
const size_t offset = blockIdx.x * unitBytesPerBlock + nLoop * unitBytes;
const size_t remainBytes = (offset < bytes) ? (bytes - offset) : 0;
if (remainBytes > 0) {
if (rank == root) {
char* send_ = reinterpret_cast<char*>(sendbuff);
for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) {
char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_); // Peer's scratchbuff.
mscclpp::copy<16, true>(dst + offset + scratchSub, send_ + offset, remainBytes, threadIdx.x, blockDim.x);
__syncthreads();
if (threadIdx.x == peerIdx) memChans[peerIdx].signal();
}
if constexpr (IsOutOfPlace) {
char* recv_ = reinterpret_cast<char*>(recvbuff);
mscclpp::copy<16, true>(recv_ + offset, send_ + offset, remainBytes, threadIdx.x, blockDim.x);
}
} else { // rank != root.
if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait();
__syncthreads();
char* recv_ = reinterpret_cast<char*>(recvbuff);
char* scratch_ = reinterpret_cast<char*>(scratchbuff); // My scratchbuff.
mscclpp::copy<16, true>(recv_ + offset, scratch_ + offset + scratchSub, remainBytes, threadIdx.x, blockDim.x);
}
} // remainBytes > 0.
}
deviceSyncer.sync(gridDim.x);
if (threadIdx.x < nPeer) {
memChans[threadIdx.x].relaxedSignal();
memChans[threadIdx.x].wait();
}
}
// Host-side launcher for the broadcast6 kernel.
//
// Template parameters:
//   IsOutOfPlace - when true, the root also copies its send buffer into its
//                  own receive buffer inside the kernel.
//   T            - element type of the user buffers.
//
// Returns the result of cudaGetLastError() after the asynchronous launch, so
// callers can detect launch-configuration failures immediately.
template <bool IsOutOfPlace, typename T>
cudaError_t broadcast(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
                      size_t channelOutOffset, int rank, int nRanksPerNode, int root, int worldSize, size_t nelems,
                      cudaStream_t stream) {
  constexpr int kNumBlocks = 7;
  constexpr int kThreadsPerBlock = 1024;
  // The kernel works in int-sized units; convert the element count accordingly.
  const size_t nIntUnits = nelems * sizeof(T) / sizeof(int);
  broadcast6<IsOutOfPlace><<<kNumBlocks, kThreadsPerBlock, 0, stream>>>(
      static_cast<void*>(buff), static_cast<void*>(scratch), static_cast<void*>(resultBuff), memoryChannels,
      channelOutOffset, rank, worldSize, root, nRanksPerNode, nIntUnits);
  return cudaGetLastError();
}
// Builder that packages the memory-channel broadcast kernel ("broadcast6") as
// an mscclpp::Algorithm. Member functions below are declared here and defined
// out of line; build() wires them together into the Algorithm object.
class BroadcastAlgo6 : public mscclpp::AlgorithmBuilder {
 public:
  BroadcastAlgo6() = default;
  // Assembles an Algorithm from the private callbacks declared below.
  mscclpp::Algorithm build() override;

 private:
  // One-time setup performed when the algorithm is first used with a
  // communicator. `extras` carries builder-specific parameters passed through
  // from the caller — presumably scratch-buffer configuration; confirm against
  // the out-of-line definition.
  void initialize(std::shared_ptr<mscclpp::Communicator> comm,
                  std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  // Launches the broadcast kernel using resources held in `ctx`; returns an
  // NCCL-style status for the interposition layer.
  ncclResult_t broadcastKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                                   size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                                   std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  // Creates the per-(buffers, size, dtype) execution context for broadcast.
  std::shared_ptr<mscclpp::AlgorithmCtx> initBroadcastContext(std::shared_ptr<mscclpp::Communicator> comm, const void*,
                                                              void* output, size_t, mscclpp::DataType);
  // Key used to cache and look up contexts created by initBroadcastContext.
  mscclpp::AlgorithmCtxKey generateBroadcastContextKey(const void*, void*, size_t, mscclpp::DataType);

  std::vector<mscclpp::Connection> conns_;  // peer connections, populated during initialization
  size_t scratchMemSize_;                   // size in bytes of scratchBuffer_
  std::shared_ptr<char> scratchBuffer_;     // device scratch memory used by the kernel
};
#endif // BROADCAST_HPP_

View File

@@ -1,67 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

// Common constants, device-side globals, and channel/connection setup helpers
// shared by the NCCL-interposition collectives.
#ifndef NCCL_COMMON_HPP_
#define NCCL_COMMON_HPP_

#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/env.hpp>
#include <mscclpp/memory_channel.hpp>
#include <mscclpp/switch_channel.hpp>

#include <vector>

#if defined(__HIP_PLATFORM_AMD__)
#define WARP_SIZE 64
// HIP has no __syncwarp(); a wave barrier is the closest equivalent on AMD.
#define __syncwarp() __builtin_amdgcn_wave_barrier()
#else
#define WARP_SIZE 32
#endif

constexpr int NUM_NVLS_CONNECTION = 8;   // NVLS (switch) connections per communicator
constexpr int NUM_SEMAPHORES = 64;       // device semaphores placed in __constant__ memory below
constexpr int MAX_NRANKS_PER_NODE = 8;   // upper bound on intra-node peers
// 140MB total: double buffer * 35 thread-blocks * 8 ranks * 256KB per block.
constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;

// NOTE: declared `static` in a header, so every translation unit that includes
// this file gets its own copy, each initialized from the environment.
static bool mscclppDisableChannelCache = mscclpp::env()->disableChannelCache;

// Device-side globals used by the collective kernels. Being defined (not just
// declared) in a header, these are per-translation-unit symbols.
__device__ mscclpp::DeviceSyncer deviceSyncer;
__constant__ mscclpp::DeviceSemaphore deviceSemaphore[NUM_SEMAPHORES];

// Exchanges `localMemory` with all peers and returns their registered memories.
std::vector<mscclpp::RegisteredMemory> setupRemoteMemories(std::shared_ptr<mscclpp::Communicator> comm, int rank,
                                                           mscclpp::RegisteredMemory localMemory);
// Builds memory channels pairing each connection/semaphore with the matching
// remote memory, `nChannelsPerConnection` channels per connection.
std::vector<mscclpp::MemoryChannel> setupMemoryChannels(
    const std::vector<mscclpp::Connection>& connections,
    const std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>>& memorySemaphores,
    const std::vector<mscclpp::RegisteredMemory>& remoteMemories, mscclpp::RegisteredMemory localMemory,
    int nChannelsPerConnection);
// Establishes connections from this rank to all peers in the communicator.
std::vector<mscclpp::Connection> setupConnections(std::shared_ptr<mscclpp::Communicator> comm);
// Creates device-to-device semaphores for each connection.
std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> setupMemorySemaphores(
    std::shared_ptr<mscclpp::Communicator> comm, const std::vector<mscclpp::Connection>& connections,
    int nChannelsPerConnection);
// Copies channel handles into device-accessible memory for kernel use.
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> setupMemoryChannelDeviceHandles(
    const std::vector<mscclpp::MemoryChannel>& memoryChannels);
// NVLS (NVLink SHARP) connection/channel setup; used by switch-based kernels.
std::vector<std::shared_ptr<mscclpp::NvlsConnection>> setupNvlsConnections(std::shared_ptr<mscclpp::Communicator> comm,
                                                                           size_t size, int numConnections);
std::vector<mscclpp::SwitchChannel> setupNvlsChannels(std::vector<std::shared_ptr<mscclpp::NvlsConnection>> conns,
                                                      void* buffer, size_t bufferSize, int nSwitchChannels);
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SwitchChannel>> setupNvlsChannelDeviceHandles(
    const std::vector<mscclpp::SwitchChannel>& nvlsChannels);
// Base (signal-only) memory channels — no data buffers attached.
std::vector<mscclpp::BaseMemoryChannel> setupBaseMemoryChannels(
    const std::vector<mscclpp::Connection>& connections,
    const std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>>& memorySemaphores,
    int nChannelsPerConnection);
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>> setupBaseMemoryChannelDeviceHandles(
    const std::vector<mscclpp::BaseMemoryChannel>& baseMemoryChannels);

#endif  // NCCL_COMMON_HPP_

View File

@@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:

View File

@@ -102,7 +102,7 @@ Key file: `dsl_with_nccl_api.py`.
#### Launch with interposition
To run with NCCL interposition, you preload the MSCCL++ shim so it transparently intercepts NCCL calls made by PyTorch's nccl backend.
```bash
LD_PRELOAD=<MSCCLPP_REPO>/build/apps/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
```
## Notices:
- When using NCCL interposition, the algorithm selection order is:

Binary file not shown.

After

Width:  |  Height:  |  Size: 43 KiB

View File

@@ -19,7 +19,7 @@ $ make -j allgather_test_perf allreduce_test_perf
For example, the following command runs the `allreduce5` algorithm with 8 GPUs starting from 3MB to 48MB messages, by doubling the message size in between. You can try different algorithms by changing the `-k 5` option to another value (e.g., `-k 3` runs `allreduce3`). Check all algorithms from the code: [allreduce_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allreduce_test.cu) and [allgather_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allgather_test.cu).
```bash
$ mpirun --bind-to numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5
$ mpirun --bind-to numa -np 8 ./bin/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5
```
*NOTE: a few algorithms set a condition on the total data size, such as to be a multiple of 3. If the condition is unmet, the command will throw a corresponding error.*
@@ -27,7 +27,7 @@ $ mpirun --bind-to numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 4
Check the help message for more details.
```bash
$ ./test/mscclpp-test/allreduce_test_perf --help
$ ./bin/allreduce_test_perf --help
USAGE: allreduce_test_perf
[-b,--minbytes <min size in bytes>]
[-e,--maxbytes <max size in bytes>]

View File

@@ -0,0 +1,470 @@
# MSCCL++ Torch Integration Guide
This guide shows how to use MSCCL++ with PyTorch for high-performance collective communication.
## Quick Start
MSCCL++ provides three ways to implement collective algorithms with PyTorch:
| Approach | Best For | Complexity |
|----------|----------|------------|
| **Default Algorithms** | Quick integration, standard use cases | Low |
| **DSL-based** | Custom communication patterns without C++ | Medium |
| **Native C++/CUDA** | Maximum control, custom kernels | High |
```{figure} ../figs/customize_algo.png
:name: MSCCL++ Customization Algorithm Selection
:alt: MSCCL++ Customization Algorithm Selection
:align: center
:width: 400px
MSCCL++ Customization Algorithm Selection Overview
```
## Prerequisites
Before starting, ensure you have:
- MSCCL++ installed with Python bindings
- PyTorch with CUDA support
**Required Environment Variables:**
```bash
export MSCCLPP_MASTER_ADDR=<master_node_ip> # IP address of master node
export MSCCLPP_MASTER_PORT=<port> # Port for communication (e.g., 29500)
```
## Common Setup: Creating a Communicator
All approaches require initializing an MSCCL++ communicator. Here's a reusable setup:
```python
import os
import torch
import netifaces as ni
import ipaddress
def get_network_interface(ip: str):
"""Find the network interface for the given IP address."""
target = ipaddress.ip_address(ip)
for interface in ni.interfaces():
addresses = ni.ifaddresses(interface)
if ni.AF_INET in addresses:
for link in addresses[ni.AF_INET]:
if "addr" in link:
if ipaddress.ip_address(link["addr"]) == target:
return interface
return None
def init_communicator():
"""Initialize MSCCL++ communicator from environment variables."""
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
local_rank = int(os.environ.get("LOCAL_RANK", rank))
torch.cuda.set_device(local_rank)
master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
master_port = os.environ["MSCCLPP_MASTER_PORT"]
interface = get_network_interface(master_addr)
if interface is None:
raise ValueError(f"Cannot find network interface for IP {master_addr}")
interface_trio = f"{interface}:{master_addr}:{master_port}"
comm_group = mscclpp.CommGroup(
interfaceIpPortTrio=interface_trio,
rank=rank,
size=world_size
)
return comm_group
```
---
## Approach 1: Default Built-in Algorithms (Easiest)
Use pre-built, optimized algorithms from MSCCL++. Best for standard collective operations.
**Example:** [customized_comm_with_default_algo.py](../../examples/torch-integration/customized_comm_with_default_algo.py)
### Step 1: Load Default Algorithms
```python
import mscclpp
import mscclpp.utils as mscclpp_utils
def load_algorithms(scratch_buffer: torch.Tensor, rank: int):
"""Load MSCCL++ default algorithm collection."""
collection_builder = mscclpp.AlgorithmCollectionBuilder()
return collection_builder.build_default_algorithms(
scratch_buffer=scratch_buffer.data_ptr(),
scratch_buffer_size=scratch_buffer.nbytes,
rank=rank
)
```
### Step 2: Create a Custom Communicator Class
```python
class CustomizedComm:
def __init__(self, comm: mscclpp.CommGroup):
self.comm = comm
# Allocate scratch buffer (required by some algorithms)
dlpack = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
self.scratch_buffer = torch.utils.dlpack.from_dlpack(dlpack)
# Load and select algorithms
algorithms = load_algorithms(self.scratch_buffer, comm.my_rank)
# Select specific algorithms by name
self._algo_small = [
algo for algo in algorithms
if algo.collective == "allreduce"
and algo.name == "default_allreduce_nvls_packet"
][0]
self._algo_large = [
algo for algo in algorithms
if algo.collective == "allreduce"
and algo.name == "default_allreduce_nvls_with_copy"
][0]
def all_reduce(self, tensor: torch.Tensor, stream=None):
# Select algorithm based on message size
algo = self._algo_small if tensor.nbytes < (1 << 20) else self._algo_large
algo.execute(
comm=self.comm.communicator,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
op=mscclpp.ReduceOp.SUM,
stream=stream.cuda_stream if stream else 0
)
```
### Step 3: Run
```bash
MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_default_algo.py
```
---
## Approach 2: DSL-based Algorithms (Medium)
Define custom communication patterns using MSCCL++ Python DSL. No C++ required.
**Example:** [customized_comm_with_dsl.py](../../examples/torch-integration/customized_comm_with_dsl.py)
### Step 1: Define the Collective Program
```python
import mscclpp
from mscclpp.language.collectives import AllReduce
from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType
from mscclpp.language.program import CollectiveProgram
from mscclpp.language.rank import Rank
def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
"""Define an allreduce using NVLS (NVLink SHARP)."""
gpu_size = spec.world_size
with CollectiveProgram(
spec.name,
spec.collective,
gpu_size,
instances=spec.instances,
protocol=spec.protocol,
num_threads_per_block=spec.num_threads_per_block,
min_message_size=spec.min_message_size,
max_message_size=spec.max_message_size,
) as program:
# Create NVLS channel for all GPUs
nvls_chan = SwitchChannel(
rank_list=[gpu for gpu in range(gpu_size)],
buffer_type=BufferType.input
)
# Create memory channels for synchronization
channels = {}
for gpu in range(gpu_size):
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)] = MemoryChannel(peer, gpu)
# Synchronize before operation
for gpu in range(gpu_size):
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)].signal(tb=0, relaxed=True)
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)].wait(tb=0, relaxed=True, data_sync=SyncType.after)
# Perform reduce and broadcast
for gpu in range(gpu_size):
rank = Rank(gpu)
input_buffer = rank.get_input_buffer()
nvls_chan.at_rank(gpu).reduce(
buffer_offset=gpu, size=1,
dst_chunk=input_buffer[gpu:gpu+1], tb=0
)
nvls_chan.at_rank(gpu).broadcast(
src_chunk=input_buffer[gpu:gpu+1],
buffer_offset=gpu, size=1, tb=0
)
# Synchronize after operation
for gpu in range(gpu_size):
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)].wait(tb=0, relaxed=True)
return program
```
### Step 2: Compile the Algorithm
```python
def setup_algorithm(rank: int, world_size: int, nranks_per_node: int):
"""Compile the DSL algorithm for this rank."""
spec = mscclpp.language.AlgoSpec(
name="allreduce_nvls",
collective=AllReduce(world_size, 1, True),
nranks_per_node=nranks_per_node,
world_size=world_size,
in_place=True,
instances=nranks_per_node,
protocol="Simple",
num_threads_per_block=1024,
min_message_size=1 << 20,
max_message_size=48 << 30,
tags={"nvls": 1},
)
return mscclpp.compile(algo=allreduce_nvls, algo_spec=spec, rank=rank)
```
### Step 3: Execute with Executor
DSL algorithms require an `Executor`:
```python
class CustomizedComm:
def __init__(self, comm: mscclpp.CommGroup, algorithm):
self.comm = comm
self.executor = mscclpp.Executor(comm.communicator) # Required for DSL
self.algorithm = algorithm
def all_reduce(self, tensor: torch.Tensor, stream=None):
self.algorithm.execute(
comm=self.comm.communicator,
executor=self.executor, # Pass executor for DSL algorithms
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
stream=stream.cuda_stream if stream else 0
)
```
### Step 4: Run
```bash
MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_dsl.py
```
---
## Approach 3: Native C++/CUDA Kernels (Advanced)
Write custom CUDA kernels for maximum performance and control.
**Example:** [customized_allgather.py](../../examples/torch-integration/customized_allgather.py) + [customized_allgather.cu](../../examples/torch-integration/customized_allgather.cu)
### Step 1: Implement the CUDA Kernel and Algorithm Builder
Create a `.cu` file with your kernel and algorithm builder:
```cpp
// customized_allgather.cu
#include <mscclpp/algorithm.hpp>
#include <mscclpp/core.hpp>
#include <pybind11/pybind11.h>
namespace py = pybind11;
// Your CUDA kernel
__global__ void allgather(
mscclpp::DeviceHandle<mscclpp::PortChannel>* channels,
int rank,
size_t nbytesPerGPU
) {
// Kernel implementation...
}
// Algorithm builder class
class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
public:
std::shared_ptr<mscclpp::Algorithm> build() override {
auto self = std::make_shared<AllgatherAlgoBuilder>();
return std::make_shared<mscclpp::NativeAlgorithm>(
"allgather", // Algorithm name
"allgather", // Collective type
// Initialize function
[self](std::shared_ptr<mscclpp::Communicator> comm) {
self->initialize(comm);
},
// Kernel execution function
[self](const std::shared_ptr<void> ctx,
const void* input, void* output,
size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, mscclpp::ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) {
return self->kernelFunc(ctx, input, output, inputSize, dtype, stream);
},
// Context initialization function
[self](std::shared_ptr<mscclpp::Communicator> comm,
const void* input, void* output,
size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->initContext(comm, input, output, inputSize, dtype);
},
// Context key generation function
[self](const void* input, void* output,
size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateContextKey(input, output, inputSize, outputSize, dtype);
}
);
}
private:
void initialize(std::shared_ptr<mscclpp::Communicator> comm) { /* ... */ }
mscclpp::CommResult kernelFunc(const std::shared_ptr<void> ctx, /* ... */) { /* ... */ }
std::shared_ptr<void> initContext(/* ... */) { /* ... */ }
mscclpp::AlgorithmCtxKey generateContextKey(/* ... */) { /* ... */ }
};
// Expose to Python
PYBIND11_MODULE(mscclpp_native, m) {
m.def("create_allgather_algorithm", []() {
auto builder = std::make_shared<AllgatherAlgoBuilder>();
auto algo = builder->build();
// Return as PyCapsule (see full example for capsule handling)
return py::reinterpret_steal<py::capsule>(getCapsule(algo));
});
}
```
### Step 2: Compile and Load in Python
```python
import mscclpp
import os
# MSCCL++ compiles the .cu file at runtime using JIT
mscclpp_native = mscclpp.compile_native(
name="mscclpp_native",
file=os.path.join(os.path.dirname(__file__), "customized_allgather.cu")
)
# Get the algorithm from the compiled module
capsule = mscclpp_native.create_allgather_algorithm()
algorithm = mscclpp.Algorithm.create_from_native_capsule(capsule)
```
### Step 3: Execute
```python
class CustomizedComm:
def __init__(self, comm: mscclpp.CommGroup):
self.comm = comm
# Compile and load native algorithm
mscclpp_native = mscclpp.compile_native(
name="mscclpp_native",
file="customized_allgather.cu"
)
capsule = mscclpp_native.create_allgather_algorithm()
self.algorithm = mscclpp.Algorithm.create_from_native_capsule(capsule)
def all_gather(self, tensor: torch.Tensor, out_tensor: torch.Tensor, stream=None):
self.algorithm.execute(
self.comm.communicator,
tensor.data_ptr(),
out_tensor.data_ptr(),
tensor.nbytes,
out_tensor.nbytes,
mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
stream=stream.cuda_stream if stream else 0
)
```
### Step 4: Run
```bash
MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
torchrun --nnodes=1 --nproc_per_node=8 customized_allgather.py
```
---
## Algorithm.execute() API Reference
All algorithms use the same `execute()` interface:
```python
algorithm.execute(
comm, # mscclpp.Communicator (required)
input_buffer, # int: input data pointer from tensor.data_ptr()
output_buffer, # int: output data pointer
input_size, # int: input size in bytes (tensor.nbytes)
output_size, # int: output size in bytes
dtype, # mscclpp.DataType: data type
op=mscclpp.ReduceOp.NOP, # Reduction operation (for reduce collectives)
stream=0, # CUDA stream handle
executor=None, # mscclpp.Executor (required for DSL algorithms)
nblocks=0, # Thread blocks (0 = auto)
nthreads_per_block=0, # Threads per block (0 = auto)
extras=None # dict[str, int]: extra pointer parameters
)
```
**Data Type Conversion:**
```python
import mscclpp.utils as mscclpp_utils
# Convert PyTorch dtype to MSCCL++ dtype
mscclpp_dtype = mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype)
```
**Getting CUDA Stream:**
```python
stream_handle = torch.cuda.current_stream().cuda_stream
```
---
## Summary
| Approach | When to Use | Key Requirement |
|----------|-------------|-----------------|
| Default | Standard operations, quick setup | Scratch buffer |
| DSL | Custom patterns, no C++ needed | Executor |
| Native | Custom kernels, max performance | `.cu` file with pybind11 |
All examples are in [`examples/torch-integration/`](../../examples/torch-integration/).

View File

@@ -12,3 +12,5 @@ This section provides advanced topics and best practices for using MSCCL++. It i
guide/advanced-connections
guide/cpp-examples
guide/customized-algorithm-with-nccl-api
guide/mscclpp-dsl-integration
guide/mscclpp-torch-integration

View File

@@ -132,20 +132,20 @@ For more details on how to use the Dev Container, see the [Dev Containers tutori
```bash
$ make -j unit_tests
$ ./test/unit_tests
$ ./bin/unit_tests
```
For thorough testing of MSCCL++ features, we need to use `mp_unit_tests` that require at least two GPUs on the system. `mp_unit_tests` also requires MPI to be installed on the system. For example, the following commands compile and run `mp_unit_tests` with two processes (two GPUs). The number of GPUs can be changed by changing the number of processes.
```bash
$ make -j mp_unit_tests
$ mpirun -np 2 ./test/mp_unit_tests
$ mpirun -np 2 ./bin/mp_unit_tests
```
To run `mp_unit_tests` with more than two nodes, you need to specify the `-ip_port` argument that is accessible from all nodes. For example:
```bash
$ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.0.0.5:50000
$ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0.0.5:50000
```
## Performance Benchmark
@@ -166,12 +166,13 @@ $ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) APIs using MSCCL++. How to use:
1. [Build MSCCL++ from source](#install-from-source).
2. Replace your `libnccl.so` library with `libmscclpp_nccl.so`, which is compiled under `./build/apps/nccl/` directory.
2. Replace your `libnccl.so` library with `libmscclpp_nccl.so`, which is compiled under `./build/lib/` directory.
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
```bash
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
If MSCCL++ is built on AMD platforms, `libmscclpp_nccl.so` would replace the [RCCL](https://github.com/ROCm/rccl) library (i.e., `librccl.so`).
@@ -188,12 +189,14 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec
Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list.
```bash
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
Example 2, ReduceScatter will still use msccl++ implementation since reducescatter is not in the fallbacklist.
```bash
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
On AMD platforms, you need to add `RCCL_MSCCL_ENABLE=0` to avoid conflicts with the fallback features.

View File

@@ -1,6 +1,7 @@
sphinx==8.2.3
breathe==4.36.0
myst_parser==4.0.1
pybind11
sphinx_rtd_theme
sphinxcontrib-mermaid
sphinx-autodoc-typehints

View File

@@ -15,9 +15,9 @@ from mscclpp import (
Transport,
)
from mscclpp.utils import GpuBuffer
import mscclpp.comm as mscclpp_comm
def create_connection(group: mscclpp_comm.CommGroup, transport: str):
def create_connection(group: mscclpp.CommGroup, transport: str):
remote_nghrs = list(range(group.nranks))
remote_nghrs.remove(group.my_rank)
if transport == "NVLink":
@@ -30,7 +30,7 @@ def create_connection(group: mscclpp_comm.CommGroup, transport: str):
return connections
if __name__ == "__main__":
mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
mscclpp_group = mscclpp.CommGroup(MPI.COMM_WORLD)
connections = create_connection(mscclpp_group, "NVLink")
nelems = 1024
memory = GpuBuffer(nelems, dtype=cp.int32)

View File

@@ -16,7 +16,7 @@ SRC = customized_allgather.cu
all: $(TARGET)
$(TARGET): $(SRC)
$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp -lnccl
$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp_collectives -lmscclpp -lnccl
clean:
rm -f $(TARGET)

View File

@@ -1,7 +1,7 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mscclpp/nccl.h>
#include <mscclpp/ext/nccl/nccl.h>
#include <sys/wait.h>
#include <filesystem>
@@ -10,6 +10,7 @@
#include <memory>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/ext/collectives/algorithm_collection_builder.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <unordered_map>
@@ -76,6 +77,15 @@ __global__ void __launch_bounds__(1024)
}
}
struct Context {
int rank;
int workSize;
int nRanksPerNode;
std::vector<mscclpp::RegisteredMemory> registeredMemories;
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::PortChannel>> portChannelDeviceHandles;
};
class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
public:
AllgatherAlgoBuilder() = default;
@@ -85,23 +95,21 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
}
}
mscclpp::Algorithm build() {
std::shared_ptr<mscclpp::Algorithm> build() override {
auto self = std::make_shared<AllgatherAlgoBuilder>();
mscclpp::Algorithm allgatherAlgo(
"allgather", "allgather",
[self](std::shared_ptr<mscclpp::Communicator> comm, std::unordered_map<std::string, std::shared_ptr<void>>&) {
self->initialize(comm);
std::shared_ptr<mscclpp::Algorithm> allgatherAlgo = std::make_shared<mscclpp::NativeAlgorithm>(
"allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
return self->allgatherKernelFunc(ctx, input, output, inputSize, stream);
},
[self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t count,
int dtype, cudaStream_t stream, std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
return self->allgatherKernelFunc(ctx, input, output, count, static_cast<ncclDataType_t>(dtype), stream,
extras);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count, int dtype) {
return self->initAllgatherContext(comm, input, output, count, static_cast<ncclDataType_t>(dtype));
},
[self](const void* input, void* output, size_t count, int dtype) {
return self->generateAllgatherContextKey(input, output, count, static_cast<ncclDataType_t>(dtype));
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize,
static_cast<ncclDataType_t>(dtype));
});
return allgatherAlgo;
}
@@ -126,34 +134,32 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
proxyService_->startProxy();
}
ncclResult_t allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
size_t count, [[maybe_unused]] ncclDataType_t dtype, cudaStream_t stream,
std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
int rank = ctx->rank;
int worldSize = ctx->workSize;
mscclpp::CommResult allgatherKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
size_t inputSize, cudaStream_t stream) {
auto algoCtx = std::static_pointer_cast<Context>(ctx);
int rank = algoCtx->rank;
int worldSize = algoCtx->workSize;
int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE;
allgather<<<1, nThreadsPerBlock, 0, stream>>>(ctx->portChannelDeviceHandles.get(), rank,
count * ncclTypeSize(dtype));
allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputSize);
if (cudaGetLastError() == cudaSuccess) {
return ncclSuccess;
return mscclpp::CommResult::CommSuccess;
}
return ncclInternalError;
return mscclpp::CommResult::CommInternalError;
}
std::shared_ptr<mscclpp::AlgorithmCtx> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm,
const void* input, void* output, size_t count,
ncclDataType_t dtype) {
auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
std::shared_ptr<void> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm, const void* input,
void* output, size_t inputSize, mscclpp::DataType dtype) {
auto ctx = std::make_shared<Context>();
ctx->rank = comm->bootstrap()->getRank();
ctx->workSize = comm->bootstrap()->getNranks();
ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
// register memories
mscclpp::RegisteredMemory inputBufRegMem =
comm->registerMemory((void*)input, count * ncclTypeSize(dtype), mscclpp::Transport::CudaIpc);
comm->registerMemory((void*)input, inputSize, mscclpp::Transport::CudaIpc);
mscclpp::RegisteredMemory outputBufRegMem =
comm->registerMemory(output, count * ncclTypeSize(dtype) * ctx->workSize, mscclpp::Transport::CudaIpc);
comm->registerMemory(output, inputSize * ctx->workSize, mscclpp::Transport::CudaIpc);
std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteRegMemories;
for (int i = 0; i < ctx->workSize; i++) {
if (i == ctx->rank) continue;
@@ -184,9 +190,9 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
return ctx;
}
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t count,
ncclDataType_t dtype) {
return {(void*)input, output, count * ncclTypeSize(dtype), count * ncclTypeSize(dtype) * worldSize_, 0};
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
size_t outputSize, ncclDataType_t dtype) {
return {(void*)input, output, inputSize, outputSize, 0};
}
};
@@ -197,16 +203,16 @@ void worker(int rank, int worldSize, ncclUniqueId id) {
// register algorithm
auto allgatherAlgoBuilder = std::make_shared<AllgatherAlgoBuilder>();
mscclpp::AlgorithmCollectionBuilder::getInstance()->addAlgorithmBuilder(allgatherAlgoBuilder);
mscclpp::AlgorithmCollectionBuilder::getInstance()->setAlgorithmSelector(
[](const std::unordered_map<std::string, std::unordered_map<std::string, mscclpp::Algorithm>>&
auto algoCollectionBuilder = mscclpp::collective::AlgorithmCollectionBuilder::getInstance();
algoCollectionBuilder->addAlgorithmBuilder(allgatherAlgoBuilder);
algoCollectionBuilder->setAlgorithmSelector(
[](const std::unordered_map<std::string, std::unordered_map<std::string, std::shared_ptr<mscclpp::Algorithm>>>&
algoMapByCollective,
std::string collective, const void* input, void* output, size_t messageSize, int dtype, int nRanksPerNode,
int worldSize) {
if (collective != "allgather") {
return mscclpp::Algorithm();
const mscclpp::CollectiveRequest& request) -> std::shared_ptr<mscclpp::Algorithm> {
if (request.collective != "allgather") {
return nullptr;
}
return algoMapByCollective.at(collective).at("allgather");
return algoMapByCollective.at(request.collective).at("allgather");
});
float *sendbuff, *recvbuff;
@@ -259,6 +265,7 @@ void worker(int rank, int worldSize, ncclUniqueId id) {
MSCCLPP_CUDATHROW(cudaFree(recvbuff));
ncclCommDestroy(comm);
algoCollectionBuilder->reset();
}
int main() {

View File

@@ -0,0 +1,199 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <Python.h>
#include <mscclpp/ext/nccl/nccl.h>
#include <pybind11/pybind11.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/port_channel.hpp>
#include <mscclpp/port_channel_device.hpp>
namespace py = pybind11;
#if defined(__HIP_PLATFORM_AMD__)
#define WARP_SIZE 64
#else
#define WARP_SIZE 32
#endif
/// Simple all-gather implemented as an all-to-all over port channels.
/// Each of the (worldSize - 1) warps drives the channel to one remote rank:
/// lane 0 of the warp puts this rank's shard, flushes the proxy, then waits
/// for the shard pushed by that peer.
__global__ void __launch_bounds__(1024)
    allgather(mscclpp::DeviceHandle<mscclpp::PortChannel>* portChannels, int rank, size_t nbytesPerGPU) {
  int warpId = threadIdx.x / WARP_SIZE;
  // Each warp is responsible for one of the remote ranks
  mscclpp::DeviceHandle<mscclpp::PortChannel> portChan = portChannels[warpId];
  // this allgather is really simple and implemented as an alltoall
  // Sender role: lane 0 of each warp asynchronously puts this rank's shard
  // (offset rank * nbytesPerGPU on the remote side) and raises a signal.
  if (threadIdx.x % WARP_SIZE == 0) {
    portChan.putWithSignal(rank * nbytesPerGPU, 0, nbytesPerGPU);
  }
  // make sure everyone has issued its put before any thread blocks everyone else in flush/wait
  __syncthreads();
  // Drain the proxy queue so the puts are actually submitted.
  if (threadIdx.x % WARP_SIZE == 0) {
    portChan.flush();
  }
  // Receiver role: wait on the semaphore until the peer's shard has arrived.
  if (threadIdx.x % WARP_SIZE == 0) {
    portChan.wait();
  }
}
/// Per-buffer-configuration state cached between allgather invocations.
/// Created by initAllgatherContext and looked up via AlgorithmCtxKey.
struct Context {
  int rank;           // this rank's index in the communicator
  int workSize;       // total number of ranks (name kept as-is; "work" not "world")
  int nRanksPerNode;  // ranks co-located on this node
  // Keeps local and remote buffer registrations alive for the context lifetime.
  std::vector<mscclpp::RegisteredMemory> registeredMemories;
  // Device-side array of channel handles, one per remote rank.
  std::shared_ptr<mscclpp::DeviceHandle<mscclpp::PortChannel>> portChannelDeviceHandles;
};
/// Builds a NativeAlgorithm implementing allgather over port channels.
///
/// The builder owns the inter-rank connections and the proxy service shared by
/// all algorithm contexts; per-buffer state (registered memories, channels)
/// lives in Context instances created on demand and cached by context key.
class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
 public:
  AllgatherAlgoBuilder() = default;

  ~AllgatherAlgoBuilder() {
    // Stop the proxy thread if initialize() ever started it.
    if (proxyService_) {
      proxyService_->stopProxy();
    }
  }

  /// Create the allgather Algorithm. A fresh builder instance is captured by
  /// every callback so its connections/proxy outlive this factory object.
  std::shared_ptr<mscclpp::Algorithm> build() override {
    auto self = std::make_shared<AllgatherAlgoBuilder>();
    std::shared_ptr<mscclpp::Algorithm> allgatherAlgo = std::make_shared<mscclpp::NativeAlgorithm>(
        "allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
        [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
               mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
               int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
          return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream);
        },
        [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
               size_t outputSize,
               mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
        [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
          return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype);
        });
    return allgatherAlgo;
  }

 private:
  std::vector<mscclpp::Connection> conns_;  // one connection per remote rank
  std::shared_ptr<mscclpp::ProxyService> proxyService_;
  int worldSize_ = 0;  // set in initialize(); zero-initialized so it is never read as garbage

  /// One-time setup: connect to every remote rank over CUDA IPC and start the
  /// proxy service that drives the port channels.
  void initialize(std::shared_ptr<mscclpp::Communicator> comm) {
    std::vector<std::shared_future<mscclpp::Connection>> connectionFutures;
    worldSize_ = comm->bootstrap()->getNranks();
    for (int i = 0; i < worldSize_; i++) {
      if (i == comm->bootstrap()->getRank()) continue;
      connectionFutures.push_back(comm->connect(mscclpp::Transport::CudaIpc, i));
    }
    std::vector<mscclpp::Connection> connections;
    std::transform(connectionFutures.begin(), connectionFutures.end(), std::back_inserter(connections),
                   [](const auto& future) { return future.get(); });
    this->conns_ = std::move(connections);
    proxyService_ = std::make_shared<mscclpp::ProxyService>();
    proxyService_->startProxy(true);
  }

  /// Launch the allgather kernel: a single block with one warp per remote rank.
  /// Returns CommInternalError if the launch failed.
  mscclpp::CommResult allgatherKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
                                          size_t inputBytes, [[maybe_unused]] mscclpp::DataType dtype,
                                          cudaStream_t stream) {
    auto algoCtx = std::static_pointer_cast<Context>(ctx);
    int rank = algoCtx->rank;
    int worldSize = algoCtx->workSize;
    int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE;
    allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputBytes);
    if (cudaGetLastError() == cudaSuccess) {
      return mscclpp::CommResult::CommSuccess;
    }
    return mscclpp::CommResult::CommInternalError;
  }

  /// Build a Context for this (input, output) buffer pair: register both
  /// buffers, exchange the output registration with every peer, and create one
  /// port channel per remote rank (remote output as destination, local input
  /// as source).
  std::shared_ptr<void> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm, const void* input,
                                             void* output, size_t inputBytes,
                                             [[maybe_unused]] mscclpp::DataType dtype) {
    auto ctx = std::make_shared<Context>();
    ctx->rank = comm->bootstrap()->getRank();
    ctx->workSize = comm->bootstrap()->getNranks();
    ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
    // register memories
    mscclpp::RegisteredMemory inputBufRegMem =
        comm->registerMemory((void*)input, inputBytes, mscclpp::Transport::CudaIpc);
    mscclpp::RegisteredMemory outputBufRegMem =
        comm->registerMemory(output, inputBytes * ctx->workSize, mscclpp::Transport::CudaIpc);
    std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteRegMemories;
    for (int i = 0; i < ctx->workSize; i++) {
      if (i == ctx->rank) continue;
      comm->sendMemory(outputBufRegMem, i, 0);
      remoteRegMemories.push_back(comm->recvMemory(i, 0));
    }
    // setup channels
    std::vector<mscclpp::DeviceHandle<mscclpp::PortChannel>> portChannels;
    mscclpp::MemoryId inputMemoryId = this->proxyService_->addMemory(inputBufRegMem);
    for (size_t i = 0; i < this->conns_.size(); i++) {  // size_t: avoid signed/unsigned comparison
      auto remoteMemory = remoteRegMemories[i].get();
      mscclpp::MemoryId remoteMemoryId = this->proxyService_->addMemory(remoteMemory);
      portChannels.push_back(mscclpp::deviceHandle(this->proxyService_->portChannel(
          this->proxyService_->buildAndAddSemaphore(*comm, this->conns_[i]), remoteMemoryId, inputMemoryId)));
    }
    ctx->portChannelDeviceHandles =
        mscclpp::detail::gpuCallocShared<mscclpp::DeviceHandle<mscclpp::PortChannel>>(portChannels.size());
    mscclpp::gpuMemcpy(ctx->portChannelDeviceHandles.get(), portChannels.data(), portChannels.size(),
                       cudaMemcpyHostToDevice);
    // keep registered memory references alive for the lifetime of the context
    std::transform(remoteRegMemories.begin(), remoteRegMemories.end(), std::back_inserter(ctx->registeredMemories),
                   [](const auto& fut) { return fut.get(); });
    ctx->registeredMemories.push_back(inputBufRegMem);
    ctx->registeredMemories.push_back(outputBufRegMem);
    return ctx;
  }

  /// Contexts are keyed by buffer pointers and sizes so repeated calls with
  /// the same buffers reuse the registered memories and channels.
  mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
                                                       size_t outputSize, [[maybe_unused]] mscclpp::DataType dtype) {
    return {(void*)input, output, inputSize, outputSize, 0};
  }
};
std::shared_ptr<mscclpp::Algorithm> createAllgatherAlgorithm() {
auto allgatherAlgoBuilder = std::make_shared<AllgatherAlgoBuilder>();
return allgatherAlgoBuilder->build();
}
/// PyCapsule destructor: frees the heap-allocated shared_ptr created in getCapsule.
void deletePtr(PyObject* capsule) {
  void* raw = PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule));
  if (raw == nullptr) {
    // Invalid capsule or name mismatch: report the pending error, don't crash.
    PyErr_WriteUnraisable(capsule);
    return;
  }
  delete static_cast<std::shared_ptr<mscclpp::Algorithm>*>(raw);
}
/// Wrap a shared_ptr<Algorithm> in a PyCapsule owned by the Python side.
/// The capsule holds a heap-allocated shared_ptr copy; deletePtr releases it.
/// @throws pybind11::error_already_set if the capsule cannot be created.
PyObject* getCapsule(std::shared_ptr<mscclpp::Algorithm> algo) {
  auto* holder = new std::shared_ptr<mscclpp::Algorithm>(algo);
  PyObject* capsule = PyCapsule_New(holder, mscclpp::ALGORITHM_NATIVE_CAPSULE_NAME, deletePtr);
  if (capsule != nullptr) {
    return capsule;
  }
  delete holder;  // capsule creation failed; avoid leaking the holder
  throw pybind11::error_already_set();
}
// Python extension entry point: exposes the native allgather algorithm as a
// PyCapsule that MSCCL++'s Python bindings can consume.
PYBIND11_MODULE(mscclpp_native, m) {
  m.doc() = "A simple C++ extension for mscclpp customized algorithm";
  m.def(
      "create_allgather_algorithm",
      // reinterpret_steal: the capsule already owns one reference; avoid double-incref.
      []() { return py::reinterpret_steal<py::capsule>(getCapsule(createAllgatherAlgorithm())); },
      "Create an allgather algorithm and return it as a PyCapsule usable by MSCCL++ Python bindings");
}

View File

@@ -0,0 +1,84 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_allgather.py
# For AMD: MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> GPU_MAX_HW_QUEUES=7 torchrun --nnodes=1 --nproc_per_node=8 customized_allgather.py
import mscclpp
import mscclpp.utils as mscclpp_utils
import torch
import os
import netifaces as ni
import ipaddress
_abs_path = os.path.dirname(os.path.abspath(__file__))
def interfaces_for_ip_netifaces(ip: str):
    """Return the name of the network interface bound to `ip`, or None if absent."""
    wanted = ipaddress.ip_address(ip)
    for name in ni.interfaces():
        # Only IPv4 addresses are considered, matching the trio format used below.
        for link in ni.ifaddresses(name).get(ni.AF_INET, []):
            addr_str = link.get("addr")
            if addr_str is not None and ipaddress.ip_address(addr_str) == wanted:
                return name
    return None
class CustomizedComm:
    """Thin wrapper around mscclpp.CommGroup exposing a custom native all-gather."""

    def __init__(self, comm: mscclpp.CommGroup):
        self.comm = comm
        self.rank = comm.my_rank
        self.world_size = comm.nranks
        self.n_ranks_per_node = comm.nranks_per_node
        self.local_rank = self.rank % self.n_ranks_per_node
        self.executor = mscclpp.Executor(comm.communicator)
        # JIT-compile the native extension next to this script and load the
        # allgather algorithm it exports as a PyCapsule.
        native_module = mscclpp.compile_native(
            name="mscclpp_native", file=os.path.join(_abs_path, "customized_allgather.cu")
        )
        self.algorithm = mscclpp.Algorithm.create_from_native_capsule(native_module.create_allgather_algorithm())

    def all_gather(self, tensor: torch.Tensor, out_tensor: torch.Tensor, stream: torch.cuda.Stream = None):
        """Gather `tensor` from every rank into `out_tensor` on the given stream."""
        cuda_stream = stream.cuda_stream if stream is not None else 0
        self.algorithm.execute(
            self.comm.communicator,
            tensor.data_ptr(),
            out_tensor.data_ptr(),
            tensor.nbytes,
            out_tensor.nbytes,
            mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
            stream=cuda_stream,
        )

    def barrier_cpu(self):
        """Host-side barrier across all ranks."""
        self.comm.barrier()
def main():
    """Single-node all-gather demo driven by torchrun environment variables."""
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ.get("LOCAL_RANK", os.environ["RANK"]))
    torch.cuda.set_device(local_rank)

    # Bootstrap the comm group from the interface that owns MSCCLPP_MASTER_ADDR.
    master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
    master_port = os.environ["MSCCLPP_MASTER_PORT"]
    interface = interfaces_for_ip_netifaces(master_addr)
    if interface is None:
        raise ValueError(f"Cannot find network interface for IP address {master_addr}")
    interfaceIpPortTrio = f"{interface}:{master_addr}:{master_port}"
    mscclpp_group = mscclpp.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world_size)

    # In-place layout: each rank's input is its own slice of the output tensor.
    shard_elems = 1 << 20
    out_tensor = torch.randn(shard_elems * world_size, device="cuda", dtype=torch.float32)
    tensor = out_tensor[rank * shard_elems : (rank + 1) * shard_elems]

    comm = CustomizedComm(mscclpp_group)
    comm.barrier_cpu()
    comm.all_gather(tensor, out_tensor, stream=torch.cuda.current_stream())
    torch.cuda.synchronize()
    comm = None
    print(f"Rank {rank} allgather completed successfully.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,120 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_default_algo.py
import os
import torch
import mscclpp.utils as mscclpp_utils
import mscclpp
import mscclpp.ext
import netifaces as ni
import ipaddress
def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection:
    """Build the default mscclpp algorithm collection backed by `scratch_buffer`."""
    builder = mscclpp.ext.AlgorithmCollectionBuilder()
    return builder.build_default_algorithms(
        scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank
    )
def interfaces_for_ip_netifaces(ip: str):
    """Find the local network interface whose IPv4 address equals `ip`.

    Returns the interface name, or None when no interface matches.
    """
    target = ipaddress.ip_address(ip)
    for interface in ni.interfaces():
        ipv4_links = ni.ifaddresses(interface).get(ni.AF_INET)
        if not ipv4_links:
            continue
        for link in ipv4_links:
            if "addr" in link and ipaddress.ip_address(link["addr"]) == target:
                return interface
    return None
def to_mscclpp_reduce_op(op: torch.distributed.ReduceOp) -> mscclpp.ReduceOp:
    """Translate a torch.distributed reduce op to its mscclpp equivalent.

    Raises ValueError for ops other than SUM and MIN.
    """
    if op == torch.distributed.ReduceOp.SUM:
        return mscclpp.ReduceOp.SUM
    if op == torch.distributed.ReduceOp.MIN:
        return mscclpp.ReduceOp.MIN
    raise ValueError(f"unsupported op: {op}")
class CustomizedComm:
    """Wrapper exposing the mscclpp default NVLS all-reduce algorithms over torch tensors."""

    @staticmethod
    def _find_allreduce_algo(algorithms, name):
        # Same lookup (and same IndexError-on-missing behavior) as an inline
        # list-comprehension followed by [0].
        return [a for a in algorithms if a.collective == "allreduce" and a.name == name][0]

    def __init__(self, comm: mscclpp.CommGroup):
        self.comm = comm
        self.rank = comm.my_rank
        self.world_size = comm.nranks
        self.local_rank = comm.my_rank % comm.nranks_per_node
        self.n_ranks_per_node = comm.nranks_per_node
        # 128 MiB scratch buffer shared by the default algorithms, surfaced to
        # torch through DLPack.
        dlpack = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
        self.scratch_buffer = torch.utils.dlpack.from_dlpack(dlpack)
        algorithms = load_algorithms(scratch_buffer=self.scratch_buffer, rank=self.rank)
        self._algorithm_nvls_packet = self._find_allreduce_algo(algorithms, "default_allreduce_nvls_packet")
        self._algorithm_nvls_nonzero_copy = self._find_allreduce_algo(algorithms, "default_allreduce_nvls_with_copy")

    def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):
        """In-place all-reduce of `tensor` (SUM only); picks packet vs copy algo by size."""
        assert op == torch.distributed.ReduceOp.SUM
        is_small_message = tensor.nbytes < 1 << 20
        algo = self._algorithm_nvls_packet if is_small_message else self._algorithm_nvls_nonzero_copy
        algo.execute(
            comm=self.comm.communicator,
            input_buffer=tensor.data_ptr(),
            output_buffer=tensor.data_ptr(),
            input_size=tensor.nbytes,
            output_size=tensor.nbytes,
            dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
            op=to_mscclpp_reduce_op(op),
            stream=stream.cuda_stream if stream is not None else 0,
        )

    def barrier(self):
        """GPU-side barrier implemented as a 1-element all-reduce."""
        probe = torch.empty(1, dtype=torch.float, device=torch.device("cuda"))
        self.all_reduce(probe, op=torch.distributed.ReduceOp.SUM, stream=torch.cuda.current_stream())

    def destroy(self):
        """Drop references so GPU resources can be reclaimed deterministically."""
        self._algorithm_nvls_nonzero_copy = None
        self._algorithm_nvls_packet = None
        self.scratch_buffer = None
        self.comm = None
def init_dist() -> CustomizedComm:
    """Create the mscclpp comm group from MSCCLPP_* / torchrun env variables."""
    rank = int(os.environ["RANK"])
    world = int(os.environ["WORLD_SIZE"])
    master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
    master_port = os.environ["MSCCLPP_MASTER_PORT"]
    interface = interfaces_for_ip_netifaces(master_addr)
    if interface is None:
        raise ValueError(f"Cannot find network interface for IP address {master_addr}")
    interfaceIpPortTrio = f"{interface}:{master_addr}:{master_port}"
    group = mscclpp.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world)
    return CustomizedComm(group)
def main():
    """Run one demo all-reduce between GPU barriers and report success."""
    local = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local)
    comm = init_dist()
    comm.barrier()
    payload = torch.randn(1 << 22, dtype=torch.float16, device=torch.device("cuda"))
    comm.all_reduce(payload, op=torch.distributed.ReduceOp.SUM, stream=torch.cuda.current_stream())
    comm.barrier()
    comm.destroy()
    print(f"rank {local} All-reduce operation completed successfully.")


if __name__ == "__main__":
    main()

View File

@@ -1,12 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_comm.py
# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_dsl.py
import os
import torch
import mscclpp.comm as mscclpp_comm
import mscclpp
import mscclpp.language
from mscclpp.language.collectives import AllReduce
from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType
from mscclpp.language.program import CollectiveProgram
@@ -15,13 +15,13 @@ import netifaces as ni
import ipaddress
def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
def allreduce_nvls(spec: mscclpp.language.AlgoSpec) -> CollectiveProgram:
gpu_size = spec.world_size
with CollectiveProgram(
spec.name,
spec.collective,
gpu_size,
instances=8,
instances=spec.instances,
protocol=spec.protocol,
num_threads_per_block=spec.num_threads_per_block,
min_message_size=spec.min_message_size,
@@ -74,14 +74,14 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
return program
def setup_plan(registry: mscclpp.ExecutionPlanRegistry, rank: int, world_size: int):
spec = mscclpp.AlgoSpec(
def setup_plan(rank: int, world_size: int, nranks_per_node: int):
spec = mscclpp.language.AlgoSpec(
name="allreduce_nvls",
collective=AllReduce(8, 1, True),
nranks_per_node=8,
collective=AllReduce(world_size, 1, True),
nranks_per_node=nranks_per_node,
world_size=world_size,
in_place=True,
instances=2,
instances=nranks_per_node,
protocol="Simple",
num_threads_per_block=1024,
min_message_size=1 << 20,
@@ -89,17 +89,10 @@ def setup_plan(registry: mscclpp.ExecutionPlanRegistry, rank: int, world_size: i
tags={"nvls": 1},
)
plan_handle = mscclpp.compile(algo=allreduce_nvls, algo_spec=spec, rank=rank)
registry.register_plan(plan_handle)
def selector(plans, req):
if req.collective != "allreduce":
return None
if req.message_size < 1 << 20:
return None
nvls = [p for p in plans if "nvls" in p.tags]
return nvls[0] if nvls else plans[0]
algorithms = []
algo = mscclpp.compile(algo=allreduce_nvls, algo_spec=spec, rank=rank)
algorithms.append(algo)
return algorithms
def interfaces_for_ip_netifaces(ip: str):
@@ -129,43 +122,37 @@ def dtype_to_mscclpp_dtype(dtype: torch.dtype) -> mscclpp.DataType:
class CustomizedComm:
def __init__(self, comm: mscclpp_comm.CommGroup):
def __init__(self, comm: mscclpp.CommGroup, algorithms=[]):
self.comm = comm
self.rank = comm.my_rank
self.world_size = comm.nranks
self.local_rank = comm.my_rank % comm.nranks_per_node
self.n_ranks_per_node = comm.nranks_per_node
self.registry = mscclpp.ExecutionPlanRegistry()
self.executor = mscclpp.Executor(comm.communicator)
self.algorithms = algorithms
def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):
assert op == torch.distributed.ReduceOp.SUM
plan = self.registry.select(
collective="allreduce",
world_size=self.world_size,
n_ranks_per_node=self.n_ranks_per_node,
send_buffer=tensor.data_ptr(),
recv_buffer=tensor.data_ptr(),
message_size=tensor.numel() * tensor.element_size(),
)
if plan is None:
raise ValueError(
f"No suitable plan found for collective allreduce with message size {tensor.numel() * tensor.element_size()}"
)
self.executor.execute(
self.rank,
tensor.data_ptr(),
tensor.data_ptr(),
tensor.numel() * tensor.element_size(),
tensor.numel() * tensor.element_size(),
dtype_to_mscclpp_dtype(tensor.dtype),
plan.plan,
stream.cuda_stream if stream is not None else 0,
algo: mscclpp.Algorithm = self.algorithms[0]
algo.execute(
comm=self.comm.communicator,
executor=self.executor,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=dtype_to_mscclpp_dtype(tensor.dtype),
stream=stream.cuda_stream if stream is not None else 0,
)
def barrier_cpu(self):
self.comm.barrier()
def destroy(self):
self.algorithms = None
self.executor = None
self.comm = None
def init_dist() -> CustomizedComm:
rank = int(os.environ["RANK"])
@@ -175,12 +162,11 @@ def init_dist() -> CustomizedComm:
interface = interfaces_for_ip_netifaces(master_addr)
if interface is None:
raise ValueError(f"Cannot find network interface for IP address {master_addr}")
registry = mscclpp.ExecutionPlanRegistry()
setup_plan(registry, rank, world)
registry.set_selector(selector)
nranks_per_node = int(torch.cuda.device_count())
algorithms = setup_plan(rank, world, nranks_per_node)
interfaceIpPortTrio = f"{interface}:{master_addr}:{master_port}"
mscclpp_group = mscclpp_comm.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world)
return CustomizedComm(mscclpp_group)
mscclpp_group = mscclpp.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world)
return CustomizedComm(mscclpp_group, algorithms)
def main():
@@ -192,9 +178,11 @@ def main():
dlpack = buffer.to_dlpack(data_type=str(torch.bfloat16))
x = torch.utils.dlpack.from_dlpack(dlpack)
x.normal_()
comm.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
comm.all_reduce(x, op=torch.distributed.ReduceOp.SUM, stream=torch.cuda.current_stream())
torch.cuda.synchronize()
comm.barrier_cpu()
comm = None
print(f"Rank {comm.rank} allreduce completed successfully.")
comm.destroy()
if __name__ == "__main__":

View File

@@ -1,9 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# LD_PRELOAD=<MSCCLPP_REPO>/build/apps/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl-torch-integration/dsl_with_nccl_api.py
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
import os
from typing import Any, Dict
import torch, torch.distributed as dist
import mscclpp
from mscclpp.language.collectives import AllReduce
@@ -62,7 +63,7 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
return program
def setup_plan(registry: mscclpp.ExecutionPlanRegistry, rank: int, world_size: int):
def setup_plan(algo_collection_builder: mscclpp.AlgorithmCollectionBuilder, rank: int, world_size: int):
spec = mscclpp.AlgoSpec(
name="allreduce_nvls",
collective=AllReduce(8, 1, True),
@@ -77,27 +78,26 @@ def setup_plan(registry: mscclpp.ExecutionPlanRegistry, rank: int, world_size: i
tags={"nvls": 1},
)
plan_handle = mscclpp.compile(algo=allreduce_nvls, algo_spec=spec, rank=rank)
registry.register_plan(plan_handle)
algo = mscclpp.compile(algo=allreduce_nvls, algo_spec=spec, rank=rank)
algo_collection_builder.add_algorithm_builder(algo)
def selector(plans, req):
def selector(algorithms: Dict[str, Any], req):
if req.collective != "allreduce":
return None
if req.message_size < 1 << 20:
return None
nvls = [p for p in plans if "nvls" in p.tags]
return nvls[0] if nvls else plans[0]
return algorithms["allreduce"]["allreduce_nvls"]
def init_dist():
rank = int(os.environ["RANK"])
world = int(os.environ["WORLD_SIZE"])
local = int(os.environ["LOCAL_RANK"])
registry = mscclpp.ExecutionPlanRegistry()
setup_plan(registry, rank, world)
registry.set_selector(selector)
dist.init_process_group(backend="nccl")
algorithm_collection_builder = mscclpp.AlgorithmCollectionBuilder()
setup_plan(algorithm_collection_builder, rank, world)
algorithm_collection_builder.set_algorithm_selector(selector)
dist.init_process_group(backend="nccl", device_id=local)
return rank, world, local
@@ -111,6 +111,7 @@ def main():
dist.all_reduce(x, op=dist.ReduceOp.SUM)
dist.barrier()
dist.destroy_process_group()
print(f"Rank {local} allreduce completed successfully.")
if __name__ == "__main__":

View File

@@ -7,3 +7,6 @@ target_sources(
BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}
FILES ${HEADERS}
)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp DESTINATION include)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/version.hpp DESTINATION include/mscclpp)

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#ifndef MSCCLPP_ALGORITHM_HPP_
#define MSCCLPP_ALGORITHM_HPP_
@@ -14,26 +14,117 @@
namespace mscclpp {
class AlgorithmCtx {
public:
int rank;
int workSize;
int nRanksPerNode;
/// Capsule name for native algorithm pointers used in Python bindings.
constexpr char ALGORITHM_NATIVE_CAPSULE_NAME[] = "mscclpp::AlgorithmPtr";
std::vector<mscclpp::RegisteredMemory> registeredMemories;
std::vector<mscclpp::MemoryChannel> memoryChannels;
std::vector<mscclpp::SwitchChannel> switchChannels;
std::vector<mscclpp::PortChannel> portChannels;
std::vector<std::shared_ptr<mscclpp::NvlsConnection>> nvlsConnections;
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> memoryChannelDeviceHandles;
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SwitchChannel>> switchChannelDeviceHandles;
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::PortChannel>> portChannelDeviceHandles;
std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores;
std::vector<std::shared_ptr<mscclpp::Host2DeviceSemaphore>> hostSemaphores;
std::unordered_map<std::string, std::shared_ptr<void>> extras;
/// Buffer placement modes a collective algorithm can declare support for.
enum class CollectiveBufferMode {
  Any = 0,     // algorithm accepts both in-place and out-of-place buffers
  InPlace,
  OutOfPlace,
};

/// How an algorithm implementation is provided.
enum class AlgorithmType {
  Native = 0,  // hand-written C++/CUDA implementation
  DSL,         // defined via the MSCCL++ DSL
};

/// Result codes returned by collective operations.
/// NOTE(review): values appear to mirror ncclResult_t ordering — confirm
/// before reordering or inserting entries.
enum class CommResult {
  CommSuccess = 0,
  CommUnhandledCudaError = 1,
  CommSystemError = 2,
  CommInternalError = 3,
  CommInvalidArgument = 4,
  CommInvalidUsage = 5,
  CommRemoteError = 6,
  CommInProgress = 7,
  CommNumResults = 8
};

/// Reduction operations for reduce-type collectives.
/// NOTE(review): SUM/MIN values presumably track ncclRedOp_t; NOP = 255 looks
/// like a "no reduction" sentinel (e.g. for allgather) — verify against callers.
enum ReduceOp { SUM = 0, MIN = 3, NOP = 255 };
/// Base class for collective communication algorithms.
///
/// This abstract class defines the interface for implementing collective communication
/// algorithms such as allreduce, allgather, and reduce-scatter. Concrete implementations
/// can be either native C++/CUDA algorithms or DSL-defined algorithms.
class Algorithm {
public:
struct Constraint {
int worldSize;
int nRanksPerNode;
};
virtual ~Algorithm() = default;
/// Get the name of the algorithm.
/// @return A reference to the algorithm name string.
virtual const std::string& name() const = 0;
/// Get the collective operation this algorithm implements.
/// @return A reference to the collective name (e.g., "allreduce", "allgather").
virtual const std::string& collective() const = 0;
/// Get the valid message size range for this algorithm.
/// @return A pair of (minMessageSize, maxMessageSize) in bytes.
virtual const std::pair<size_t, size_t>& messageRange() const = 0;
/// Get the tags associated with this algorithm.
/// @return An unordered map of tag names to tag values.
virtual const std::unordered_map<std::string, uint64_t>& tags() const = 0;
/// Get the buffer mode supported by this algorithm.
/// @return The CollectiveBufferMode indicating in-place, out-of-place, or any.
virtual const CollectiveBufferMode& bufferMode() const = 0;
/// Get the type of this algorithm.
/// @return AlgorithmType::Native or AlgorithmType::DSL.
virtual AlgorithmType type() const = 0;
/// Get the execution constraints for this algorithm.
/// @return The Constraint struct specifying worldSize and nRanksPerNode requirements.
virtual Constraint constraint() const = 0;
/// Execute the algorithm.
/// @param comm The communicator to use.
/// @param input Pointer to the input buffer.
/// @param output Pointer to the output buffer.
/// @param inputSize Size of the input buffer in bytes.
/// @param outputSize Size of the output buffer in bytes.
/// @param dtype The data type of the elements.
/// @param op The reduction operation (for reduce-type collectives).
/// @param stream The CUDA stream to execute on.
/// @param executor The executor for DSL algorithms (may be nullptr for native).
/// @param nBlocks Number of CUDA blocks (0 for auto-selection).
/// @param nThreadsPerBlock Number of threads per block (0 for auto-selection).
/// @param extras Additional parameters for algorithm-specific customization.
/// @return The result of the operation.
virtual CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
const std::unordered_map<std::string, uintptr_t>& extras = {}) = 0;
/// Reset the algorithm state, clearing any cached contexts.
virtual void reset() = 0;
};
/// Interface for building Algorithm instances.
///
/// Implement this interface to create custom algorithm factories that can be
/// registered with the AlgorithmCollectionBuilder.
class AlgorithmBuilder {
public:
virtual ~AlgorithmBuilder() = default;
/// Build and return an Algorithm instance.
/// @return A shared pointer to the constructed Algorithm.
virtual std::shared_ptr<Algorithm> build() = 0;
};
/// Key for identifying cached AlgorithmCtx instances.
///
/// The context key uniquely identifies a buffer configuration, allowing
/// the algorithm to cache and reuse contexts for repeated operations with
/// the same buffers.
struct AlgorithmCtxKey {
void* baseSendBuff;
void* baseRecvBuff;
@@ -47,42 +138,6 @@ struct AlgorithmCtxKey {
}
};
class AlgorithmImpl;
/// A collective algorithm composed of user-supplied callbacks for
/// initialization, context creation/caching, and kernel launch.
/// The class itself is a thin pImpl handle and is cheap to copy.
class Algorithm {
 public:
  /// One-time initialization callback; receives the communicator and a
  /// mutable map of extra, algorithm-specific shared state.
  using InitFunc = std::function<void(std::shared_ptr<mscclpp::Communicator>,
                                      std::unordered_map<std::string, std::shared_ptr<void>>&)>;
  /// Kernel-launch callback: (ctx, input, output, count, dtype, stream, extras) -> status code.
  using KernelFunc = std::function<int(const std::shared_ptr<AlgorithmCtx>, const void*, void*, size_t, DataType,
                                       cudaStream_t, std::unordered_map<std::string, std::shared_ptr<void>>&)>;
  /// Callback that builds an AlgorithmCtx for a given buffer configuration.
  using ContextInitFunc = std::function<std::shared_ptr<AlgorithmCtx>(std::shared_ptr<mscclpp::Communicator>,
                                                                      const void*, void*, size_t, DataType)>;
  /// Callback that maps a buffer configuration to a cache key (see AlgorithmCtxKey).
  using ContextKeyGenFunc =
      std::function<AlgorithmCtxKey(const void* input, void* output, size_t count, DataType dtype)>;

  /// Construct an algorithm from its name, target collective, and callbacks.
  Algorithm(std::string name, std::string collective, InitFunc initFunc, KernelFunc kernelFunc,
            ContextInitFunc contextInitFunc, ContextKeyGenFunc contextKeyGenFunc);
  /// Default-constructed Algorithm is "empty"; check with isEmpty() before use.
  Algorithm() = default;

  /// @brief Launch the algorithm.
  /// @param comm The communicator.
  /// @param input The input buffer.
  /// @param output The output buffer.
  /// @param count The number of elements.
  /// @param dtype The data type.
  /// @param stream The CUDA stream.
  /// @param extras Extra, algorithm-specific shared state passed through to the kernel.
  /// @details This method will call ContextKeyGenFunc to generate a context key based on the input parameters,
  /// and then use the context key to retrieve or create an AlgorithmCtx. The kernel function
  /// will be launched with the AlgorithmCtx.
  int launch(std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count, DataType dtype,
             cudaStream_t stream, std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  /// @return True if this handle was default-constructed (has no implementation).
  bool isEmpty();
  /// @return Human-readable algorithm name.
  std::string name() const;
  /// @return Name of the collective this algorithm implements (e.g., "allreduce").
  std::string collective() const;

 private:
  class Impl;
  std::shared_ptr<Impl> impl_;
};
} // namespace mscclpp
namespace std {
@@ -103,72 +158,202 @@ struct hash<mscclpp::AlgorithmCtxKey> {
namespace mscclpp {
class AlgorithmBuilder {
/// Native C++/CUDA implementation of a collective algorithm.
///
/// NativeAlgorithm allows users to implement custom collective algorithms in C++/CUDA.
/// It provides a framework for initialization, context management, and kernel execution.
/// Contexts are cached based on buffer configurations to avoid redundant setup.
class NativeAlgorithm : public Algorithm {
public:
virtual ~AlgorithmBuilder() = default;
virtual Algorithm build() = 0;
using InitFunc = std::function<void(std::shared_ptr<Communicator>)>;
/// Function type for the kernel that executes the collective operation.
/// @param ctx The algorithm context containing channels and semaphores.
/// @param input Pointer to the input buffer.
/// @param output Pointer to the output buffer.
/// @param inputSize Size of the input buffer in bytes.
/// @param outputSize Size of the output buffer in bytes.
/// @param dtype Data type of the elements.
/// @param op Reduction operation (for reduce-type collectives).
/// @param stream CUDA stream to execute on.
/// @param nBlocks Number of CUDA blocks.
/// @param nThreadsPerBlock Number of threads per block.
/// @param extras Additional algorithm-specific parameters.
/// @return The result of the operation.
using KernelFunc =
std::function<CommResult(const std::shared_ptr<void>, const void*, void*, size_t, size_t, DataType, ReduceOp,
cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&)>;
/// Function type for creating algorithm contexts.
/// @param comm The communicator.
/// @param input Pointer to the input buffer.
/// @param output Pointer to the output buffer.
/// @param inputSize Size of the input buffer.
/// @param outputSize Size of the output buffer.
/// @param dtype Data type of the elements.
/// @return A shared pointer to the created context.
using ContextInitFunc =
std::function<std::shared_ptr<void>(std::shared_ptr<Communicator>, const void*, void*, size_t, size_t, DataType)>;
/// Function type for generating context keys.
/// @param input Pointer to the input buffer.
/// @param output Pointer to the output buffer.
/// @param inputSize Size of the input buffer.
/// @param outputSize Size of the output buffer.
/// @param dtype Data type of the elements.
/// @return A key uniquely identifying this buffer configuration.
using ContextKeyGenFunc = std::function<AlgorithmCtxKey(const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype)>;
/// Construct a NativeAlgorithm.
/// @param name Human-readable name of the algorithm.
/// @param collective The collective operation (e.g., "allreduce").
/// @param initFunc Function called once to initialize the algorithm.
/// @param kernelFunc Function that launches the CUDA kernel.
/// @param contextInitFunc Function that creates execution contexts.
/// @param contextKeyGenFunc Function that generates cache keys for contexts.
/// @param minMessageSize Minimum supported message size in bytes (default: 0).
/// @param maxMessageSize Maximum supported message size in bytes (default: UINT64_MAX).
/// @param bufferMode Buffer mode supported by this algorithm (default: ANY).
/// @param tags Tags for algorithm selection hints.
/// @param constraint Execution constraints (worldSize, nRanksPerNode).
NativeAlgorithm(std::string name, std::string collective, InitFunc initFunc, KernelFunc kernelFunc,
ContextInitFunc contextInitFunc, ContextKeyGenFunc contextKeyGenFunc, size_t minMessageSize = 0,
size_t maxMessageSize = UINT64_MAX, CollectiveBufferMode bufferMode = CollectiveBufferMode::Any,
std::unordered_map<std::string, uint64_t> tags = {}, Constraint constraint = {});
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
const std::string& name() const override;
const std::string& collective() const override;
const std::pair<size_t, size_t>& messageRange() const override;
const std::unordered_map<std::string, uint64_t>& tags() const override;
const CollectiveBufferMode& bufferMode() const override;
AlgorithmType type() const override { return AlgorithmType::Native; }
Constraint constraint() const override;
void reset() override;
private:
std::string name_;
std::string collective_;
NativeAlgorithm::InitFunc initFunc_;
NativeAlgorithm::KernelFunc kernelLaunchFunc_;
NativeAlgorithm::ContextInitFunc contextInitFunc_;
NativeAlgorithm::ContextKeyGenFunc contextKeyGenFunc_;
size_t minMessageSize_;
size_t maxMessageSize_;
CollectiveBufferMode bufferMode_;
std::unordered_map<std::string, uint64_t> tags_;
Constraint constraint_;
std::unordered_map<AlgorithmCtxKey, std::shared_ptr<void>> contexts_;
bool initialized_ = false;
};
using AlgoSelectFunc = std::function<Algorithm(
const std::unordered_map<std::string, std::unordered_map<std::string, Algorithm>>& algoMapByCollective,
std::string collective, const void* input, void* output, size_t messageSize, DataType dtype, int nRanksPerNode,
int worldSize)>;
/// DSL-based implementation of a collective algorithm.
///
/// DslAlgorithm wraps an ExecutionPlan loaded from a DSL specification file.
/// It implements both Algorithm and AlgorithmBuilder interfaces, allowing it
/// to be used directly or registered with AlgorithmCollectionBuilder.
class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enable_shared_from_this<DslAlgorithm> {
 public:
  /// Construct a DslAlgorithm from an execution plan.
  /// @param id Identifier for this algorithm instance.
  /// @param plan The execution plan defining the algorithm.
  /// @param tags Tags for algorithm selection hints.
  /// @param constraint Execution constraints (worldSize, nRanksPerNode).
  DslAlgorithm(std::string id, ExecutionPlan plan, std::unordered_map<std::string, uint64_t> tags = {},
               Constraint constraint = {});
  /// @return The algorithm name.
  const std::string& name() const override;
  /// @return The collective this algorithm implements (e.g., "allreduce").
  const std::string& collective() const override;
  /// @return Supported [min, max] message size range in bytes.
  const std::pair<size_t, size_t>& messageRange() const override;
  /// @return Tags used as selection hints.
  const std::unordered_map<std::string, uint64_t>& tags() const override;
  /// @return Buffer mode (in-place / out-of-place / any) supported by the plan.
  const CollectiveBufferMode& bufferMode() const override;
  /// Execute the wrapped execution plan on the given buffers and stream.
  /// @param executor Executor used to run the plan.
  /// @param nBlocks / nThreadsPerBlock Launch-shape overrides; 0 means use defaults.
  /// @param extras Additional algorithm-specific parameters.
  CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
                     size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
                     std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
                     const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
  /// @return AlgorithmType::DSL.
  AlgorithmType type() const override { return AlgorithmType::DSL; }
  /// @return Execution constraints (worldSize, nRanksPerNode).
  Constraint constraint() const override;
  /// Reset internal state. NOTE(review): exact scope of the reset is defined
  /// by the implementation; not visible from this declaration.
  void reset() override;
  /// AlgorithmBuilder hook. Presumably returns this instance via
  /// shared_from_this() (hence the enable_shared_from_this base) — confirm in
  /// the implementation.
  std::shared_ptr<Algorithm> build() override;

 private:
  ExecutionPlan plan_;
  std::string id_;
  std::unordered_map<std::string, uint64_t> tags_;
  Constraint constraint_;
};
/// Request parameters for selecting and executing a collective operation.
///
/// This struct encapsulates all the information needed to select an appropriate
/// algorithm for a collective operation.
///
/// NOTE(review): `collective` and `hints` are reference members; a
/// CollectiveRequest must not outlive the objects those references were bound
/// to, and the struct is neither copy- nor default-assignable as a result.
struct CollectiveRequest {
  int worldSize;            // Total number of ranks in the communicator.
  int nRanksPerNode;        // Ranks co-located on each node.
  int rank;                 // Rank of the caller.
  const void* inputBuffer;  // Source buffer of the collective.
  void* outputBuffer;       // Destination buffer of the collective.
  size_t messageSize;       // Message size in bytes.
  const std::string& collective;  // Collective name (e.g., "allreduce").
  const DataType dtype;           // Element data type.
  const std::unordered_map<std::string, std::vector<uint64_t>>& hints;  // Optional selection hints.

  /// Buffer mode implied by this request — presumably derived from whether
  /// inputBuffer and outputBuffer alias (in-place) or not; confirm in the
  /// implementation.
  CollectiveBufferMode bufferMode() const;
};
/// Function type for custom algorithm selection.
/// @param algoMapByCollective Map of collective names to available algorithms.
/// @param request The collective request parameters.
/// @return The selected algorithm, or nullptr if no suitable algorithm is found.
using AlgoSelectFunc = std::function<std::shared_ptr<Algorithm>(
const std::unordered_map<std::string, std::unordered_map<std::string, std::shared_ptr<Algorithm>>>&
algoMapByCollective,
const CollectiveRequest& request)>;
/// Collection of algorithms for collective operations.
///
/// AlgorithmCollection manages a set of algorithms indexed by collective operation
/// name and algorithm name. It provides methods to select the best algorithm for
/// a given request and to register new algorithms.
class AlgorithmCollection {
public:
AlgorithmCollection() = default;
/// @brief Select an algorithm based on the collective operation name and message size.
/// @param collective The collective operation name.
/// @param input The input buffer.
/// @param output The output buffer.
/// @param messageSize The message size.
/// @param dtype The data type.
/// @param nRanksPerNode The number of ranks per node.
/// @param worldSize The total number of ranks.
/// @return The selected algorithm. If no suitable algorithm is found, an empty Algorithm object is returned.
Algorithm selectAlgorithm(const std::string& collective, const void* input, void* output, size_t messageSize,
DataType dtype, int nRanksPerNode, int worldSize);
/// Select an algorithm based on the collective operation name and message size.
/// @param request The collective request containing all necessary parameters.
/// @return The selected algorithm. If no suitable algorithm is found, nullptr is returned.
std::shared_ptr<Algorithm> selectAlgorithm(const CollectiveRequest& request);
/// @brief Register a new algorithm.
/// @param collective The collective operation name.
/// Register a new algorithm.
/// @param collective The collective operation name (e.g., "allreduce").
/// @param algoName The algorithm name.
/// @param algorithm The algorithm implementation.
void registerAlgorithm(const std::string collective, const std::string algoName, Algorithm algorithm);
void registerAlgorithm(const std::string collective, const std::string algoName,
std::shared_ptr<Algorithm> algorithm);
/// Get all algorithms for a specific collective operation.
/// @param collective The collective operation name.
/// @return A map of algorithm names to algorithm instances.
std::unordered_map<std::string, std::shared_ptr<Algorithm>> getAlgorithmsByCollective(
const std::string& collective) const;
/// Get all registered algorithms.
/// @return A vector containing all algorithm instances.
std::vector<std::shared_ptr<Algorithm>> getAllAlgorithms() const;
/// Extend this collection with algorithms from another collection.
/// @param other The other AlgorithmCollection to merge in.
void extend(const AlgorithmCollection& other);
void setSelectors(AlgoSelectFunc algoSelector, AlgoSelectFunc fallbackAlgoSelector);
private:
std::unordered_map<std::string, std::unordered_map<std::string, Algorithm>> algoMapByCollective_;
AlgoSelectFunc algoSelector_ = nullptr;
AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
friend class AlgorithmCollectionBuilder;
};
/// Singleton builder that aggregates AlgorithmBuilder factories and selector
/// functions, and assembles them into an AlgorithmCollection.
class AlgorithmCollectionBuilder {
 public:
  /// @return The process-wide singleton instance.
  static std::shared_ptr<AlgorithmCollectionBuilder> getInstance();
  /// @brief Add a new algorithm builder for a specific collective operation.
  /// @param builder The algorithm builder.
  void addAlgorithmBuilder(std::shared_ptr<AlgorithmBuilder> builder);
  /// @brief Set a new algorithm selection function.
  /// @param selector The algorithm selection function.
  void setAlgorithmSelector(AlgoSelectFunc selector);
  /// @brief Set a fallback algorithm selection function.
  /// @param selector The fallback algorithm selection function.
  /// @details The fallback selector will be used if the primary selector returns an empty algorithm. MSCCL++ will
  /// assign a predefined selector as the fallback selector.
  void setFallbackAlgorithmSelector(AlgoSelectFunc selector);
  /// @brief Build the AlgorithmCollection instance.
  /// @return The AlgorithmCollection instance.
  std::shared_ptr<AlgorithmCollection> build();

 private:
  // Private: instances are obtained via getInstance() only.
  AlgorithmCollectionBuilder() = default;
  std::vector<std::shared_ptr<AlgorithmBuilder>> algoBuilders_;
  std::unordered_map<std::string, std::unordered_map<std::string, std::shared_ptr<Algorithm>>> algoMapByCollective_;
  AlgoSelectFunc algoSelector_ = nullptr;
  AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
};

View File

@@ -381,12 +381,12 @@ struct EndpointConfig {
/// These settings are only used when the transport is an InfiniBand type (IB0-IB7); they are ignored for other
/// transports.
struct Ib {
static const int DefaultPort = -1;
static const int DefaultGidIndex = 0;
static const int DefaultMaxCqSize = 1024;
static const int DefaultMaxCqPollNum = 1;
static const int DefaultMaxSendWr = 8192;
static const int DefaultMaxWrPerSend = 64;
static constexpr int DefaultPort = -1;
static constexpr int DefaultGidIndex = 0;
static constexpr int DefaultMaxCqSize = 1024;
static constexpr int DefaultMaxCqPollNum = 1;
static constexpr int DefaultMaxSendWr = 8192;
static constexpr int DefaultMaxWrPerSend = 64;
/// Device index. Currently ignored; use transport type (IB0-IB7) to select device.
int deviceIndex;

View File

@@ -33,10 +33,10 @@ class ExecutionPlan {
~ExecutionPlan() = default;
/// Return the human-readable name of the plan.
std::string name() const;
const std::string& name() const;
/// Return the collective implemented by this plan (e.g., "allreduce", "allgather").
std::string collective() const;
const std::string& collective() const;
/// Minimum message size (in bytes) for which this plan is valid.
size_t minMessageSize() const;
@@ -54,95 +54,6 @@ class ExecutionPlan {
friend class Executor;
};
/// Request parameters provided when executing a plan.
///
/// NOTE(review): `collective` and `hints` are reference members; the request
/// must not outlive the objects those references were bound to.
struct ExecutionRequest {
  int worldSize;            // Total number of ranks.
  int nRanksPerNode;        // Ranks co-located on each node.
  int rank;                 // Rank of the caller.
  const void* inputBuffer;  // Source buffer.
  void* outputBuffer;       // Destination buffer.
  size_t messageSize;       // Message size in bytes.
  const std::string& collective;  // Collective name (e.g., "allreduce").
  const std::unordered_map<std::string, std::vector<uint64_t>>& hints;  // Optional selection hints.

  /// Whether the request indicates an in-place operation.
  bool isInPlace() const;
};
/// A handle representing a specific execution plan along with its constraints and metadata.
struct ExecutionPlanHandle {
  /// Constraints that must be satisfied for the plan to be valid.
  struct Constraint {
    int worldSize;      // Required total number of ranks.
    int nRanksPerNode;  // Required ranks per node.
  };
  std::string id;                                  /// Unique identifier for the handle.
  Constraint constraint;                           /// Constraints for plan applicability.
  std::shared_ptr<ExecutionPlan> plan;             /// Backing ExecutionPlan instance.
  std::unordered_map<std::string, uint64_t> tags;  /// Optional tags/metadata used by selector.

  /// Create a new ExecutionPlanHandle.
  /// @param id Unique id for the handle.
  /// @param worldSize Required world size for the plan.
  /// @param nRanksPerNode Required ranks-per-node for the plan.
  /// @param plan The associated ExecutionPlan.
  /// @param tags Optional tags used for selection.
  static std::shared_ptr<ExecutionPlanHandle> create(const std::string& id, int worldSize, int nRanksPerNode,
                                                     std::shared_ptr<ExecutionPlan> plan,
                                                     const std::unordered_map<std::string, uint64_t>& tags = {});

  /// Check whether the given ExecutionRequest satisfies this handle's parameters.
  /// @param request The execution request to evaluate.
  /// @return True if the request matches the handle parameters, false otherwise.
  bool match(const ExecutionRequest& request);
};
/// Selector function type used to pick an ExecutionPlanHandle from a list of candidates.
using ExecutionPlanSelector = std::function<std::shared_ptr<ExecutionPlanHandle>(
const std::vector<std::shared_ptr<ExecutionPlanHandle>> plans, const ExecutionRequest& request)>;
/// Registry that holds available execution plans and performs selection logic.
///
/// Accessed as a process-wide singleton via getInstance(). Implementation
/// details live behind a pImpl.
class ExecutionPlanRegistry {
 public:
  /// Retrieve the singleton instance of the registry.
  static std::shared_ptr<ExecutionPlanRegistry> getInstance();
  /// Destructor.
  ~ExecutionPlanRegistry();
  /// Register a plan handle with the registry.
  void registerPlan(const std::shared_ptr<ExecutionPlanHandle> planHandle);
  /// Get all plan handles for a given collective name.
  std::vector<std::shared_ptr<ExecutionPlanHandle>> getPlans(const std::string& collective);
  /// Lookup a plan handle by id.
  std::shared_ptr<ExecutionPlanHandle> get(const std::string& id);
  /// Select a suitable plan handle for the given parameters.
  /// Uses the custom selector when set, otherwise the default selector.
  std::shared_ptr<ExecutionPlanHandle> select(const std::string& collective, int worldSize, int nRanksPerNode, int rank,
                                              const void* sendBuffer, void* recvBuffer, size_t messageSize,
                                              const std::unordered_map<std::string, std::vector<uint64_t>>& hints);
  /// Provide a custom selector function.
  void setSelector(ExecutionPlanSelector selector);
  /// Set the default selector used when no custom selector is provided.
  void setDefaultSelector(ExecutionPlanSelector selector);
  /// Load built-in/default plans for the given rank.
  void loadDefaultPlans(int rank);
  /// Clear all registered plans from the registry.
  void clear();

 private:
  struct Impl;
  std::unique_ptr<Impl> impl_;
  // Private: instances are obtained via getInstance() only.
  ExecutionPlanRegistry();
};
/// High-level executor responsible for invoking execution plans on a communicator.
class Executor {
public:

View File

@@ -0,0 +1,67 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#ifndef MSCCLPP_EXT_COLLECTIVES_ALGORITHM_COLLECTION_BUILDER_HPP_
#define MSCCLPP_EXT_COLLECTIVES_ALGORITHM_COLLECTION_BUILDER_HPP_
#include <mscclpp/algorithm.hpp>
namespace mscclpp {
namespace collective {
/// Builder for creating AlgorithmCollection instances.
///
/// AlgorithmCollectionBuilder provides a singleton interface for registering
/// algorithm builders and configuring algorithm selection functions. It can
/// build both default algorithms and custom algorithms registered by users.
///
/// Typical usage:
/// 1. Get the singleton instance with getInstance()
/// 2. Add algorithm builders with addAlgorithmBuilder()
/// 3. Optionally set custom selectors with setAlgorithmSelector()
/// 4. Build the collection with build() or buildDefaultAlgorithms()
class AlgorithmCollectionBuilder {
 public:
  /// Get the singleton instance of the builder.
  /// @return A shared pointer to the singleton instance.
  static std::shared_ptr<AlgorithmCollectionBuilder> getInstance();
  /// Reset the singleton instance.
  static void reset();
  /// Add a new algorithm builder.
  /// @param builder The algorithm builder to add.
  void addAlgorithmBuilder(std::shared_ptr<AlgorithmBuilder> builder);
  /// Set a custom algorithm selection function.
  /// @param selector The algorithm selection function.
  void setAlgorithmSelector(AlgoSelectFunc selector);
  /// Set a fallback algorithm selection function.
  /// @param selector The fallback algorithm selection function.
  /// @note The fallback selector is used if the primary selector returns nullptr.
  ///       MSCCL++ assigns a predefined selector as the fallback by default.
  void setFallbackAlgorithmSelector(AlgoSelectFunc selector);
  /// Build the AlgorithmCollection instance.
  /// @return The built AlgorithmCollection containing all registered algorithms.
  AlgorithmCollection build();
  /// Build a collection containing MSCCL++'s built-in algorithms (native and DSL).
  /// @param scratchBuffer Scratch buffer address passed to the native algorithms
  ///        (as uintptr_t; presumably a device pointer — confirm with callers).
  /// @param scratchBufferSize Size of the scratch buffer in bytes.
  /// @param rank Rank used when loading default DSL plans.
  /// @return The built AlgorithmCollection.
  AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, int rank);

 private:
  // Private: instances are obtained via getInstance() only.
  AlgorithmCollectionBuilder() = default;
  std::vector<std::shared_ptr<AlgorithmBuilder>> algoBuilders_;
  AlgoSelectFunc algoSelector_ = nullptr;
  AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
  // Helpers for buildDefaultAlgorithms(): native C++/CUDA algorithms and
  // DSL-plan-based algorithms respectively.
  AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize);
  AlgorithmCollection buildDefaultDslAlgorithms(int rank);
  static std::shared_ptr<AlgorithmCollectionBuilder> gAlgorithmCollectionBuilder_;
};
} // namespace collective
} // namespace mscclpp
#endif // MSCCLPP_EXT_COLLECTIVES_ALGORITHM_COLLECTION_BUILDER_HPP_

View File

@@ -6,10 +6,10 @@ add_subdirectory(test)
add_custom_target(pytest_lib_copy ALL
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_BINARY_DIR}/csrc/_mscclpp.*.so
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_mscclpp.*.so
${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_ext.*.so
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
DEPENDS mscclpp_py mscclpp_py_test
)

View File

@@ -3,7 +3,7 @@
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
include(FetchContent)
FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0)
FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.9.2)
FetchContent_MakeAvailable(nanobind)
FetchContent_Declare(dlpack
@@ -21,6 +21,7 @@ endif()
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp)
nanobind_add_module(mscclpp_py ${SOURCES})
set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp)
target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp_static ${GPU_LIBRARIES})
set_target_properties(mscclpp_py PROPERTIES INSTALL_RPATH "\$ORIGIN/lib")
target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp mscclpp_collectives ${GPU_LIBRARIES})
target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
install(TARGETS mscclpp_py LIBRARY DESTINATION .)

113
python/csrc/algorithm.cpp Normal file
View File

@@ -0,0 +1,113 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <nanobind/nanobind.h>
#include <nanobind/stl/function.h>
#include <nanobind/stl/pair.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/vector.h>
#include <cstring>
#include <mscclpp/algorithm.hpp>
namespace nb = nanobind;
using namespace mscclpp;
/// Register the algorithm-related enums and classes with the Python module.
///
/// NOTE: registration order matters in nanobind — enums used as argument
/// defaults (e.g. ReduceOp::NOP) and base classes (Algorithm) must be bound
/// before the bindings that reference them.
void register_algorithm(nb::module_& m) {
  // Buffer placement modes an algorithm can support.
  nb::enum_<CollectiveBufferMode>(m, "CollectiveBufferMode")
      .value("ANY", CollectiveBufferMode::Any)
      .value("IN_PLACE", CollectiveBufferMode::InPlace)
      .value("OUT_OF_PLACE", CollectiveBufferMode::OutOfPlace);

  nb::enum_<AlgorithmType>(m, "AlgorithmType").value("NATIVE", AlgorithmType::Native).value("DSL", AlgorithmType::DSL);

  // Status codes returned by Algorithm.execute().
  nb::enum_<CommResult>(m, "CommResult")
      .value("COMM_SUCCESS", CommResult::CommSuccess)
      .value("COMM_UNHANDLED_CUDA_ERROR", CommResult::CommUnhandledCudaError)
      .value("COMM_SYSTEM_ERROR", CommResult::CommSystemError)
      .value("COMM_INTERNAL_ERROR", CommResult::CommInternalError)
      .value("COMM_INVALID_ARGUMENT", CommResult::CommInvalidArgument)
      .value("COMM_INVALID_USAGE", CommResult::CommInvalidUsage)
      .value("COMM_REMOTE_ERROR", CommResult::CommRemoteError)
      .value("COMM_IN_PROGRESS", CommResult::CommInProgress)
      .value("COMM_NUM_RESULTS", CommResult::CommNumResults);

  // Must be bound before Algorithm.execute(), which defaults op=ReduceOp::NOP.
  nb::enum_<ReduceOp>(m, "ReduceOp")
      .value("SUM", ReduceOp::SUM)
      .value("MIN", ReduceOp::MIN)
      .value("NOP", ReduceOp::NOP);

  auto algorithmClass =
      nb::class_<Algorithm>(m, "Algorithm")
          // Unwrap a shared_ptr<Algorithm> handed across the C API as a
          // PyCapsule; validates both the capsule name and the payload.
          .def_static(
              "from_native_capsule",
              [](nb::capsule cap) {
                const char* name = cap.name();
                if (name == nullptr || std::strcmp(name, ALGORITHM_NATIVE_CAPSULE_NAME) != 0) {
                  throw nb::type_error("Invalid capsule: expected 'mscclpp::AlgorithmPtr'");
                }
                void* data = cap.data();
                if (data == nullptr) {
                  throw nb::value_error("Failed to get pointer from capsule");
                }
                // Copy the shared_ptr out of the capsule (shares ownership).
                return *static_cast<std::shared_ptr<Algorithm>*>(data);
              },
              nb::arg("capsule"))
          .def_prop_ro("name", &Algorithm::name)
          .def_prop_ro("collective", &Algorithm::collective)
          .def_prop_ro("message_range", &Algorithm::messageRange)
          .def_prop_ro("tags", &Algorithm::tags)
          .def_prop_ro("buffer_mode", &Algorithm::bufferMode)
          .def_prop_ro("constraint", &Algorithm::constraint)
          .def_prop_ro("type", &Algorithm::type)
          // Buffers and the stream are passed from Python as integer
          // addresses (uintptr_t) and reinterpreted as raw pointers here.
          .def(
              "execute",
              [](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
                 size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
                 std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock,
                 std::unordered_map<std::string, uintptr_t> extras) {
                return self.execute(comm, reinterpret_cast<const void*>(input), reinterpret_cast<void*>(output),
                                    inputSize, outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
                                    nBlocks, nThreadsPerBlock, extras);
              },
              nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
              nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
              nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0,
              nb::arg("extras") = std::unordered_map<std::string, uintptr_t>());

  // Nested type: exposed as Algorithm.Constraint in Python.
  nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")
      .def(nb::init<>())
      .def(nb::init<int, int>(), nb::arg("world_size"), nb::arg("n_ranks_per_node"))
      .def_rw("world_size", &Algorithm::Constraint::worldSize)
      .def_rw("n_ranks_per_node", &Algorithm::Constraint::nRanksPerNode);

  nb::class_<AlgorithmBuilder>(m, "AlgorithmBuilder").def("build", &AlgorithmBuilder::build);

  // DslAlgorithm derives from Algorithm (bound above).
  nb::class_<DslAlgorithm, Algorithm>(m, "DslAlgorithm")
      .def(nb::init<std::string, ExecutionPlan, std::unordered_map<std::string, uint64_t>, Algorithm::Constraint>(),
           nb::arg("id"), nb::arg("plan"), nb::arg("tags") = std::unordered_map<std::string, uint64_t>(),
           nb::arg("constraint") = Algorithm::Constraint())
      .def("build", &DslAlgorithm::build);

  nb::class_<AlgorithmCollection>(m, "AlgorithmCollection")
      .def("register_algorithm", &AlgorithmCollection::registerAlgorithm, nb::arg("collective"), nb::arg("algo_name"),
           nb::arg("algorithm"))
      .def("get_algorithms_by_collective", &AlgorithmCollection::getAlgorithmsByCollective, nb::arg("collective"))
      .def("to_list", &AlgorithmCollection::getAllAlgorithms);

  // Read-only view of a selection request; pointer fields are exposed as
  // integer addresses, and reference members are returned by value/lambda.
  nb::class_<CollectiveRequest>(m, "CollectiveRequest")
      .def_ro("world_size", &CollectiveRequest::worldSize)
      .def_ro("n_ranks_per_node", &CollectiveRequest::nRanksPerNode)
      .def_ro("rank", &CollectiveRequest::rank)
      .def_prop_ro("input_buffer",
                   [](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.inputBuffer); })
      .def_prop_ro("output_buffer",
                   [](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.outputBuffer); })
      .def_ro("message_size", &CollectiveRequest::messageSize)
      .def_prop_ro("collective", [](const CollectiveRequest& self) { return self.collective; })
      .def_ro("dtype", &CollectiveRequest::dtype)
      .def_prop_ro("hints", [](const CollectiveRequest& self) { return self.hints; })
      .def("buffer_mode", &CollectiveRequest::bufferMode);
}

View File

@@ -3,7 +3,6 @@
#include <nanobind/nanobind.h>
#include <nanobind/operators.h>
#include <nanobind/stl/array.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/vector.h>
@@ -26,6 +25,10 @@ extern void register_nvls(nb::module_& m);
extern void register_executor(nb::module_& m);
extern void register_npkit(nb::module_& m);
extern void register_gpu_utils(nb::module_& m);
extern void register_algorithm(nb::module_& m);
// ext
extern void register_algorithm_collection_builder(nb::module_& m);
template <typename T>
void def_shared_future(nb::handle& m, const std::string& typestr) {
@@ -36,6 +39,13 @@ void def_shared_future(nb::handle& m, const std::string& typestr) {
void register_core(nb::module_& m) {
m.def("version", &version);
nb::enum_<DataType>(m, "DataType")
.value("int32", DataType::INT32)
.value("uint32", DataType::UINT32)
.value("float16", DataType::FLOAT16)
.value("float32", DataType::FLOAT32)
.value("bfloat16", DataType::BFLOAT16);
nb::class_<Bootstrap>(m, "Bootstrap")
.def("get_rank", &Bootstrap::getRank)
.def("get_n_ranks", &Bootstrap::getNranks)
@@ -61,7 +71,15 @@ void register_core(nb::module_& m) {
.def("recv", static_cast<void (Bootstrap::*)(std::vector<char>&, int, int)>(&Bootstrap::recv), nb::arg("data"),
nb::arg("peer"), nb::arg("tag"));
nb::class_<UniqueId>(m, "UniqueId");
nb::class_<UniqueId>(m, "UniqueId")
.def(nb::init<>())
.def("__setstate__",
[](UniqueId& self, nb::bytes b) {
if (nb::len(b) != UniqueIdBytes) throw std::runtime_error("Invalid UniqueId byte size");
::memcpy(self.data(), b.c_str(), UniqueIdBytes);
})
.def("__getstate__",
[](const UniqueId& self) { return nb::bytes(reinterpret_cast<const char*>(self.data()), UniqueIdBytes); });
nb::class_<TcpBootstrap, Bootstrap>(m, "TcpBootstrap")
.def(nb::init<int, int>(), "Do not use this constructor. Use create instead.")
@@ -284,4 +302,8 @@ NB_MODULE(_mscclpp, m) {
register_executor(m);
register_npkit(m);
register_gpu_utils(m);
register_algorithm(m);
// ext
register_algorithm_collection_builder(m);
}

View File

@@ -15,50 +15,8 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_executor(nb::module_& m) {
nb::enum_<DataType>(m, "DataType")
.value("int32", DataType::INT32)
.value("uint32", DataType::UINT32)
.value("float16", DataType::FLOAT16)
.value("float32", DataType::FLOAT32)
.value("bfloat16", DataType::BFLOAT16);
nb::enum_<PacketType>(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
nb::class_<ExecutionRequest>(m, "ExecutionRequest")
.def_ro("world_size", &ExecutionRequest::worldSize)
.def_ro("n_ranks_per_node", &ExecutionRequest::nRanksPerNode)
.def_prop_ro(
"input_buffer",
[](const ExecutionRequest& self) -> uintptr_t { return reinterpret_cast<uintptr_t>(self.inputBuffer); })
.def_prop_ro(
"output_buffer",
[](const ExecutionRequest& self) -> uintptr_t { return reinterpret_cast<uintptr_t>(self.outputBuffer); })
.def_ro("message_size", &ExecutionRequest::messageSize)
.def_prop_ro("collective", [](ExecutionRequest& self) -> const std::string& { return self.collective; })
.def_prop_ro("hints", [](ExecutionRequest& self) { return self.hints; });
nb::class_<ExecutionPlanHandle>(m, "ExecutionPlanHandle")
.def_ro("id", &ExecutionPlanHandle::id)
.def_ro("constraint", &ExecutionPlanHandle::constraint)
.def_ro("plan", &ExecutionPlanHandle::plan)
.def_ro("tags", &ExecutionPlanHandle::tags)
.def_static("create", &ExecutionPlanHandle::create, nb::arg("id"), nb::arg("world_size"),
nb::arg("nranks_per_node"), nb::arg("plan"),
nb::arg("tags") = std::unordered_map<std::string, uint64_t>{});
nb::class_<ExecutionPlanHandle::Constraint>(m, "ExecutionPlanConstraint")
.def_ro("world_size", &ExecutionPlanHandle::Constraint::worldSize)
.def_ro("n_ranks_per_node", &ExecutionPlanHandle::Constraint::nRanksPerNode);
nb::class_<ExecutionPlanRegistry>(m, "ExecutionPlanRegistry")
.def_static("get_instance", &ExecutionPlanRegistry::getInstance)
.def("register_plan", &ExecutionPlanRegistry::registerPlan, nb::arg("planHandle"))
.def("get_plans", &ExecutionPlanRegistry::getPlans, nb::arg("collective"))
.def("get", &ExecutionPlanRegistry::get, nb::arg("id"))
.def("set_selector", &ExecutionPlanRegistry::setSelector, nb::arg("selector"))
.def("set_default_selector", &ExecutionPlanRegistry::setDefaultSelector, nb::arg("selector"))
.def("clear", &ExecutionPlanRegistry::clear);
nb::class_<ExecutionPlan>(m, "ExecutionPlan")
.def(nb::init<const std::string&, int>(), nb::arg("planPath"), nb::arg("rank"))
.def_prop_ro("name", [](const ExecutionPlan& self) -> std::string { return self.name(); })

View File

@@ -0,0 +1,34 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <nanobind/nanobind.h>
#include <nanobind/stl/function.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/vector.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/ext/collectives/algorithm_collection_builder.hpp>
namespace nb = nanobind;
using namespace mscclpp;
using namespace mscclpp::collective;
/// Register the ext AlgorithmCollectionBuilder with the Python module.
///
/// NOTE: relies on Algorithm/DslAlgorithm/AlgorithmCollection having been
/// bound first (see register_algorithm), since they appear in signatures here.
void register_algorithm_collection_builder(nb::module_& m) {
  nb::class_<AlgorithmCollectionBuilder>(m, "AlgorithmCollectionBuilder")
      .def_static("get_instance", &AlgorithmCollectionBuilder::getInstance)
      .def("add_algorithm_builder", &AlgorithmCollectionBuilder::addAlgorithmBuilder, nb::arg("builder"))
      // Convenience overload: a DslAlgorithm is itself an AlgorithmBuilder,
      // so it can be registered directly.
      .def(
          "add_dsl_algorithm_builder",
          [](AlgorithmCollectionBuilder& self, std::shared_ptr<DslAlgorithm> algorithm) {
            self.addAlgorithmBuilder(algorithm);
          },
          nb::arg("algorithm"))
      .def("set_algorithm_selector", &AlgorithmCollectionBuilder::setAlgorithmSelector, nb::arg("selector"))
      .def("set_fallback_algorithm_selector", &AlgorithmCollectionBuilder::setFallbackAlgorithmSelector,
           nb::arg("selector"))
      .def("build", &AlgorithmCollectionBuilder::build)
      // scratch_buffer is an integer device address (uintptr_t) — see the
      // C++ declaration of buildDefaultAlgorithms.
      .def("build_default_algorithms", &AlgorithmCollectionBuilder::buildDefaultAlgorithms, nb::arg("scratch_buffer"),
           nb::arg("scratch_buffer_size"), nb::arg("rank"))
      .def_static("reset", &AlgorithmCollectionBuilder::reset);
}

View File

@@ -3,43 +3,26 @@
"""MSCCL++ Python API."""
import atexit
from dataclasses import dataclass
from functools import cached_property, wraps
import inspect
import json
from functools import wraps
import os
from pathlib import Path
from typing import Any
import warnings
from blake3 import blake3
from mscclpp.language.program import CollectiveProgram
from mscclpp.language.utils import AlgoSpec
from functools import wraps
from mscclpp._version import __version__, __commit_id__
from ._version import __version__, __commit_id__
if os.environ.get("MSCCLPP_HOME", None) is None:
os.environ["MSCCLPP_HOME"] = os.path.abspath(os.path.dirname(__file__))
# Parse the version
version = {
"version": __version__,
"git_commit": __commit_id__,
}
from ._core import *
from ._mscclpp import (
Env,
ErrorCode,
BaseError,
Error,
SysError,
CudaError,
CuError,
IbError,
Device,
DeviceType,
Communicator,
@@ -60,16 +43,15 @@ from ._mscclpp import (
Transport,
TransportFlags,
DataType,
ErrorCode,
Executor,
ExecutionPlan,
ExecutionPlanConstraint,
PacketType,
RawGpuBuffer,
ReduceOp,
env,
is_nvls_supported,
npkit,
ExecutionPlanHandle as _ExecutionPlanHandle,
ExecutionPlanRegistry as _ExecutionPlanRegistry,
)
__all__ = [
@@ -79,6 +61,7 @@ __all__ = [
"Connection",
"connect_nvls_collective",
"EndpointConfig",
"ErrorCode",
"Fifo",
"Semaphore",
"Host2DeviceSemaphore",
@@ -97,6 +80,7 @@ __all__ = [
"ExecutionPlan",
"PacketType",
"RawGpuBuffer",
"ReduceOp",
"env",
"version",
"is_nvls_supported",
@@ -107,6 +91,11 @@ __all__ = [
"version",
"get_include",
"get_lib",
# Python API
"Algorithm",
"AlgorithmCollection",
"CommGroup",
"GpuBuffer",
]
@@ -135,193 +124,5 @@ def deprecated(new_cls):
return decorator
class ExecutionPlanHandle:
    """Thin Python wrapper around a native ``_ExecutionPlanHandle``.

    Field values are fetched from the native object once and memoized via
    ``cached_property``.
    """

    def __init__(self, handle: _ExecutionPlanHandle):
        self._handle = handle

    @cached_property
    def id(self) -> str:
        # Plan ids are blake3 hex digests produced by `compile`, hence `str`.
        return self._handle.id

    @cached_property
    def tags(self) -> frozenset:
        # Frozen so the cached value cannot be mutated by callers.
        return frozenset(self._handle.tags)

    @cached_property
    def plan(self) -> ExecutionPlan:
        return self._handle.plan

    @cached_property
    def constraints(self) -> ExecutionPlanConstraint:
        return self._handle.constraints
@dataclass(frozen=True)
class ExecutionRequest:
    """Immutable description of one collective invocation, handed to a plan selector."""

    collective: str  # collective name, e.g. "allreduce"
    world_size: int  # total number of ranks
    n_ranks_per_node: int  # ranks per node
    send_buffer: int  # send buffer address as int — presumably a device pointer; confirm with callers
    recv_buffer: int  # receive buffer address as int — presumably a device pointer; confirm with callers
    message_size: int  # message size in bytes
    hints: dict  # free-form selection hints (name -> value)
class ExecutionPlanRegistry:
    """Process-wide singleton mapping execution plans to collectives.

    Mirrors the native ``_ExecutionPlanRegistry`` singleton and keeps a
    Python-side index (by plan id and by collective name) so a Python
    selector callback can pick a plan for a given request.
    """

    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(ExecutionPlanRegistry, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        # Guard so repeated construction of the singleton does not wipe
        # already-registered state.
        if not hasattr(self, "_initialized"):
            self._registry = _ExecutionPlanRegistry.get_instance()
            self._id_map = {}
            self._collective_map = {}
            self._selector = None
            self._initialized = True

    def register_plan(self, plan: ExecutionPlanHandle):
        """Index `plan` by id and collective, then register it with the native registry."""
        self._id_map[plan.id] = plan
        self._collective_map.setdefault(plan.plan.collective, []).append(plan)
        # Fix: use `self._registry` directly instead of the redundant
        # `self._instance._registry` indirection (this is a singleton).
        return self._registry.register_plan(plan._handle)

    def set_selector(self, selector):
        """Install `selector` as the plan-selection callback (Python and native side)."""
        self._selector = selector
        self._registry.set_selector(selector)

    def set_default_selector(self, selector):
        """Install `selector` as the fallback plan-selection callback."""
        self._selector = selector
        self._registry.set_default_selector(selector)

    def get(self, id: str) -> ExecutionPlanHandle:
        """Return the handle registered under `id`, or None when unknown."""
        return self._id_map.get(id, None)

    def select(
        self,
        collective: str,
        world_size: int,
        n_ranks_per_node: int,
        send_buffer: int,
        recv_buffer: int,
        message_size: int,
        hints: dict = None,
    ) -> ExecutionPlanHandle:
        """Ask the installed selector for a plan matching the request.

        Returns None when no selector is installed or no plan is registered
        for `collective`.
        """
        if self._selector is None or collective not in self._collective_map:
            return None
        req = ExecutionRequest(
            collective=collective,
            world_size=world_size,
            n_ranks_per_node=n_ranks_per_node,
            send_buffer=send_buffer,
            recv_buffer=recv_buffer,
            message_size=message_size,
            # Fix: `None` sentinel instead of a mutable `{}` default — a
            # shared default dict would leak state across requests if a
            # selector ever mutated `req.hints`.
            hints=hints if hints is not None else {},
        )
        return self._selector(self._collective_map[collective], req)

    @classmethod
    def reset_instance(cls):
        """Clear native and Python-side state and drop the singleton."""
        if cls._instance is not None:
            cls._instance._registry.clear()
            cls._instance._id_map = {}
            cls._instance._collective_map = {}
            cls._instance._selector = None
            cls._instance = None
# Release native registry state at interpreter shutdown, then create the
# process-wide singleton used by `compile`.
atexit.register(ExecutionPlanRegistry.reset_instance)
_execution_plan_registry = ExecutionPlanRegistry()
def _stable_json_bytes(obj: Any) -> bytes:
return json.dumps(
obj,
sort_keys=True,
ensure_ascii=False,
separators=(",", ":"),
).encode("utf-8")
def compile(
    algo,
    algo_spec: AlgoSpec,
    rank: int,
    **kwargs,
) -> ExecutionPlanHandle:
    """Compile a MSCCL++ program from a high-level algorithm description.

    Args:
        algo: The high-level algorithm description (e.g., a function or class).
        algo_spec (AlgoSpec): Algorithm specification containing collective type,
            world size, ranks per node, instances, protocol, and other configuration.
        rank (int): The rank of the current process.
        **kwargs: Additional keyword arguments passed to the algorithm function.

    Returns:
        ExecutionPlanHandle: The compiled execution plan handle.

    Raises:
        ValueError: If the 'algo' argument is not callable.

    Note:
        Plans are cached on disk under ``MSCCLPP_EXECUTION_PLAN_DIR``
        (default ``~/.cache/mscclpp``); the cache key hashes the algorithm
        source, the spec, and the package version.
    """
    if not callable(algo):
        raise ValueError("The 'algo' argument must be a callable (e.g., a function or class).")
    prog: CollectiveProgram = algo(
        algo_spec,
        **kwargs,
    )
    # Cache key: any change to the algorithm source or its configuration
    # yields a new plan id (and thus a new plan file).
    source = inspect.getsource(algo)
    source_hash = blake3(source.encode("utf-8")).hexdigest()
    plan_id = blake3(
        _stable_json_bytes(
            {
                "version": __version__,
                "algo_name": algo_spec.name,
                "collective": algo_spec.collective.name,
                "tags": sorted(algo_spec.tags.items()),
                "source_hash": source_hash,
                "envs": {
                    "nranks_per_node": algo_spec.nranks_per_node,
                    "world_size": algo_spec.world_size,
                    "instances": algo_spec.instances,
                    "protocol": algo_spec.protocol,
                },
            }
        )
    ).hexdigest()
    plan_handle = _execution_plan_registry.get(plan_id)
    if plan_handle is not None:
        return plan_handle
    plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp")
    os.makedirs(plan_dir, exist_ok=True)
    filename = f"{plan_id}.json"
    plan_path = os.path.join(plan_dir, filename)
    tmp_path = plan_path + f".tmp.{os.getpid()}"
    if not os.path.exists(plan_path):
        try:
            # TODO (binyli): Each rank could generate its own execution plan separately. Doesn't need to generate whole plan.
            with open(tmp_path, "w") as f:
                prog.post_process_operations()
                f.write(prog.to_json(indent=None, separators=(",", ":"), ensure_ascii=False))
                f.flush()
                os.fsync(f.fileno())
            # Publish atomically; another rank may have won the race.
            if not os.path.exists(plan_path):
                os.rename(tmp_path, plan_path)
            else:
                os.remove(tmp_path)
        except Exception:
            # Bug fix: clean up our private temp file instead of unlinking
            # `plan_path`, which may be a valid plan that another process
            # just published. The old code also leaked `tmp_path`.
            Path(tmp_path).unlink(missing_ok=True)
    execution_plan = ExecutionPlan(plan_path, rank)
    handle = _ExecutionPlanHandle.create(
        id=plan_id,
        world_size=algo_spec.world_size,
        nranks_per_node=algo_spec.nranks_per_node,
        plan=execution_plan,
        tags=algo_spec.tags,
    )
    return ExecutionPlanHandle(handle)
# Module-level compiler entry points: `compile` builds DSL algorithms,
# `compile_native` builds native CUDA/HIP extension modules.
compile: DslCompiler = DslCompiler()
compile_native: NativeCodeCompiler = NativeCodeCompiler()

View File

@@ -0,0 +1,13 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from .algorithm import *
from .comm import *
from .compiler import *
from .buffer import *
__all__ = []
__all__ += algorithm.__all__
__all__ += comm.__all__
__all__ += compiler.__all__
__all__ += buffer.__all__

View File

@@ -0,0 +1,230 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import annotations
from typing import Optional, Tuple, Dict
from functools import cached_property
from mscclpp._mscclpp import (
Algorithm as _Algorithm,
DslAlgorithm as _DslAlgorithm,
AlgorithmType as _AlgorithmType,
Communicator,
CollectiveBufferMode,
DataType,
Executor,
ExecutionPlan,
ReduceOp,
)
__all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"]
class Algorithm:
    """A wrapper for collective communication algorithms.

    This class provides a Python interface for collective communication algorithms
    such as allreduce, allgather, and reduce-scatter. Algorithms can be either
    DSL-based (defined using MSCCL++ execution plans) or native (implemented in C++/CUDA).

    Attributes:
        name: Human-readable name of the algorithm.
        collective: The collective operation this algorithm implements (e.g., "allreduce").
        message_size_range: Tuple of (min_size, max_size) in bytes for valid message sizes.
        tags: Dictionary of tag names to tag values for algorithm selection hints.
        buffer_mode: The buffer mode supported by this algorithm (IN_PLACE, OUT_OF_PLACE, or ANY).
    """

    class Constraint:
        """Constraints that define valid execution environments for the algorithm.

        Args:
            world_size: Required world size (number of ranks). 0 means any size.
            n_ranks_per_node: Required number of ranks per node. 0 means any.
        """

        def __init__(self, world_size: int = 0, n_ranks_per_node: int = 0):
            self._constraint = _Algorithm.Constraint(world_size, n_ranks_per_node)

        @property
        def world_size(self) -> int:
            return self._constraint.worldSize

        @property
        def n_ranks_per_node(self) -> int:
            return self._constraint.nRanksPerNode

    def __init__(
        self,
        id: Optional[str] = None,
        execution_plan: Optional[ExecutionPlan] = None,
        native_handle: Optional[_Algorithm] = None,
        tags: Optional[Dict[str, int]] = None,
        constraint: Optional[Constraint] = None,
    ):
        """Wrap either a DSL execution plan or an existing native handle.

        Exactly one of `execution_plan` or `native_handle` should be given;
        `execution_plan` takes precedence when both are present.

        Raises:
            ValueError: If neither `execution_plan` nor `native_handle` is provided.
        """
        if execution_plan is not None:
            self._algorithm = _DslAlgorithm(
                id,
                execution_plan,
                tags=tags if tags is not None else {},
                constraint=constraint._constraint if constraint is not None else _Algorithm.Constraint(),
            )
        elif native_handle is not None:
            self._algorithm = native_handle
        else:
            # Bug fix: previously `_algorithm` was silently left unset here,
            # deferring the failure to the first attribute access. Fail fast.
            raise ValueError("Either 'execution_plan' or 'native_handle' must be provided.")

    @classmethod
    def create_from_native_handle(cls, handle: _Algorithm):
        """Create an Algorithm instance from a native C++ algorithm handle.

        Args:
            handle: The native C++ algorithm handle.

        Returns:
            A new Algorithm instance wrapping the native handle.
        """
        return cls(
            native_handle=handle,
        )

    @classmethod
    def create_from_native_capsule(cls, obj):
        """Create an Algorithm instance from a PyCapsule object.

        Args:
            obj: A PyCapsule containing a native algorithm pointer.

        Returns:
            A new Algorithm instance wrapping the algorithm from the capsule.
        """
        handle = _Algorithm.from_native_capsule(obj)
        return cls(native_handle=handle)

    @cached_property
    def name(self) -> str:
        """The human-readable name of the algorithm."""
        return self._algorithm.name

    @cached_property
    def collective(self) -> str:
        """The collective operation this algorithm implements (e.g., "allreduce", "allgather")."""
        return self._algorithm.collective

    @cached_property
    def message_size_range(self) -> Tuple[int, int]:
        """The valid message size range (min_size, max_size) in bytes."""
        return (self._algorithm.message_range[0], self._algorithm.message_range[1])

    @cached_property
    def tags(self) -> Dict[str, int]:
        """Dictionary of tag names to tag values for algorithm selection hints."""
        return self._algorithm.tags

    @cached_property
    def buffer_mode(self) -> CollectiveBufferMode:
        """The buffer mode supported by this algorithm (IN_PLACE, OUT_OF_PLACE, or ANY)."""
        return self._algorithm.buffer_mode

    def is_dsl_algorithm(self) -> bool:
        """Check if this is a DSL-based algorithm.

        Returns:
            True if this algorithm is defined using DSL/execution plan, False otherwise.
        """
        # Idiom fix: return the comparison directly instead of if/return True.
        return self._algorithm.type == _AlgorithmType.DSL

    def is_native_algorithm(self) -> bool:
        """Check if this is a native C++/CUDA algorithm.

        Returns:
            True if this algorithm is implemented natively, False otherwise.
        """
        return self._algorithm.type == _AlgorithmType.NATIVE

    def execute(
        self,
        comm: Communicator,
        input_buffer: int,
        output_buffer: int,
        input_size: int,
        output_size: int,
        dtype: DataType,
        op: ReduceOp = ReduceOp.NOP,
        stream: int = 0,
        executor: Optional[Executor] = None,
        nblocks=0,
        nthreads_per_block=0,
        extras: Optional[Dict[str, int]] = None,
    ) -> int:
        """Execute the collective algorithm.

        Args:
            comm: The communicator to use.
            input_buffer: Device pointer to the input buffer.
            output_buffer: Device pointer to the output buffer.
            input_size: Size of the input buffer in bytes.
            output_size: Size of the output buffer in bytes.
            dtype: Data type of the elements.
            op: Reduction operation for reduce-type collectives (default: NOP).
            stream: CUDA stream to execute on (default: 0).
            executor: The executor for DSL algorithms (required for DSL, optional for native).
            nblocks: Number of CUDA blocks (0 for auto-selection).
            nthreads_per_block: Number of threads per block (0 for auto-selection).
            extras: Additional algorithm-specific parameters.

        Returns:
            The result code (0 for success).
        """
        return self._algorithm.execute(
            comm,
            int(input_buffer),
            int(output_buffer),
            input_size,
            output_size,
            dtype,
            op,
            int(stream),
            executor,
            nblocks,
            nthreads_per_block,
            extras if extras is not None else {},
        )
class AlgorithmBuilder:
    """Wraps a native algorithm builder and produces `Algorithm` instances.

    NOTE(review): the `_AlgorithmBuilder` annotation is not among this
    module's visible imports; harmless at runtime under
    `from __future__ import annotations`, but confirm the import for
    type checkers.
    """

    def __init__(self, algorithm_builder: _AlgorithmBuilder):
        self._algorithm_builder = algorithm_builder

    def build(self) -> Algorithm:
        # Build the native algorithm, then wrap it in the Python facade.
        return Algorithm.create_from_native_handle(self._algorithm_builder.build())
class AlgorithmCollection:
    """An iterable, indexable Python view over a native algorithm collection."""

    def __init__(self, native_collection: _AlgorithmCollection):
        self._native_collection = native_collection
        # Wrap every native handle in the Python `Algorithm` facade up front.
        native_handles = self._native_collection.to_list()
        self._algorithms = list(map(Algorithm.create_from_native_handle, native_handles))

    def __iter__(self):
        """Iterate over all algorithms in the collection."""
        return iter(self._algorithms)

    def __len__(self):
        """Return the number of algorithms in the collection."""
        return len(self._algorithms)

    def __getitem__(self, index: int) -> Algorithm:
        """Get an algorithm by index."""
        return self._algorithms[index]

    def get_by_collective(self, collective: str):
        """Get all algorithms for a specific collective operation."""
        matches = []
        for candidate in self._algorithms:
            if candidate.collective == collective:
                matches.append(candidate)
        return matches

    def register_algorithm(self, collective: str, algo_name: str, algorithm: Algorithm):
        """Register an algorithm for a collective operation."""
        self._native_collection.register_algorithm(collective, algo_name, algorithm._algorithm)
        self._algorithms.append(algorithm)

View File

@@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from typing import Union, Tuple
import cupy as cp
import numpy as np
from mscclpp._mscclpp import RawGpuBuffer
__all__ = ["GpuBuffer"]
class GpuBuffer(cp.ndarray):
    """A cupy ndarray backed by a raw MSCCL++ GPU allocation (``RawGpuBuffer``).

    NOTE(review): ``__new__`` returns the result of ``cp.ndarray(...)``, i.e. a
    plain ``cp.ndarray`` view over the allocation — not an instance of
    ``GpuBuffer`` — so ``isinstance(buf, GpuBuffer)`` is False. Confirm this
    is intentional.
    """

    def __new__(
        cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
    ):
        # Check if `shape` is valid
        if isinstance(shape, int):
            shape = (shape,)
        try:
            shape = tuple(shape)
        except TypeError:
            raise ValueError("Shape must be a tuple-like or an integer.")
        if any(s <= 0 for s in shape):
            raise ValueError("Shape must be positive.")
        # Create the buffer. Passing `buffer` as the UnownedMemory owner keeps
        # the RawGpuBuffer alive as long as the ndarray references it.
        buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
        memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
        return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)

View File

@@ -2,22 +2,19 @@
# Licensed under the MIT license.
from __future__ import annotations
from typing import Tuple, Type
from typing import Type
import cupy as cp
from ._mscclpp import (
from mscclpp._mscclpp import (
Communicator,
Connection,
connect_nvls_collective,
EndpointConfig,
Semaphore,
Host2DeviceSemaphore,
Host2HostSemaphore,
ProxyService,
RegisteredMemory,
PortChannel,
MemoryChannel,
MemoryDevice2DeviceSemaphore,
TcpBootstrap,
Transport,
TransportFlags,
@@ -27,6 +24,8 @@ import numpy as np
from mscclpp.utils import is_torch_tensor
__all__ = ["CommGroup"]
class CommGroup:
def __init__(

View File

@@ -0,0 +1,350 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import annotations
import importlib.util
import inspect
import logging
import json
import os
import subprocess
import fcntl
from typing import Any, Callable
from pathlib import Path
import pybind11
import sys
import sysconfig
from blake3 import blake3
import cupy as cp
from mscclpp._version import __version__
from .algorithm import Algorithm
from mscclpp.language.program import CollectiveProgram
from mscclpp.language.utils import AlgoSpec
from mscclpp.utils import get_device_arch
from mscclpp._mscclpp import (
ExecutionPlan,
)
logging.basicConfig(level=logging.INFO)
__all__ = ["DslCompiler", "NativeCodeCompiler"]
def _stable_json_bytes(obj: Any) -> bytes:
return json.dumps(
obj,
sort_keys=True,
ensure_ascii=False,
separators=(",", ":"),
).encode("utf-8")
class DslCompiler:
    """Compiler for MSCCL++ DSL (Domain-Specific Language) algorithms.

    This compiler transforms high-level algorithm descriptions written in Python
    into execution plans that can be run on GPUs. The compiled plans are cached
    to disk for reuse.

    The cache location can be configured via the `MSCCLPP_EXECUTION_PLAN_DIR`
    environment variable (defaults to `~/.cache/mscclpp`).

    Example:
        >>> compiler = DslCompiler()
        >>> algo = compiler.compile(my_allreduce_algo, algo_spec, rank=0)
    """

    def __init__(self):
        pass

    def __call__(self, algo: Callable[..., CollectiveProgram], algo_spec: AlgoSpec, rank: int, **kwds) -> Algorithm:
        # Allow the compiler instance to be used directly as a function.
        return self.compile(algo, algo_spec, rank, **kwds)

    def compile(
        self,
        algo: Callable[..., CollectiveProgram],
        algo_spec: AlgoSpec,
        rank: int,
        **kwargs,
    ) -> Algorithm:
        """Compile a MSCCL++ DSL program from a high-level algorithm description.

        Args:
            algo: A callable (function or class) that takes an AlgoSpec and returns
                a CollectiveProgram. This defines the communication pattern.
            algo_spec: Algorithm specification (collective type, world size,
                ranks per node, instances, protocol, name, and selection tags).
            rank: The rank of the current process (0 to world_size-1).
            **kwargs: Additional keyword arguments passed to the algorithm function.

        Returns:
            Algorithm: The compiled algorithm ready for execution.

        Raises:
            ValueError: If the 'algo' argument is not callable.

        Note:
            Compiled execution plans are cached to disk. The cache key is computed
            from the algorithm source code, specification, and MSCCL++ version.
            Subsequent calls with the same inputs will reuse the cached plan.
        """
        if not callable(algo):
            raise ValueError("The 'algo' argument must be a callable (e.g., a function or class).")
        prog: CollectiveProgram = algo(
            algo_spec,
            **kwargs,
        )
        # Cache key: any change to the source or configuration yields a new id.
        source = inspect.getsource(algo)
        source_hash = blake3(source.encode("utf-8")).hexdigest()
        plan_id = blake3(
            _stable_json_bytes(
                {
                    "version": __version__,
                    "algo_name": algo_spec.name,
                    "collective": algo_spec.collective.name,
                    "tags": sorted(algo_spec.tags.items()),
                    "source_hash": source_hash,
                    "envs": {
                        "nranks_per_node": algo_spec.nranks_per_node,
                        "world_size": algo_spec.world_size,
                        "instances": algo_spec.instances,
                        "protocol": algo_spec.protocol,
                    },
                }
            )
        ).hexdigest()
        plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp")
        os.makedirs(plan_dir, exist_ok=True)
        filename = f"{plan_id}.json"
        plan_path = os.path.join(plan_dir, filename)
        tmp_path = plan_path + f".tmp.{os.getpid()}"
        if not os.path.exists(plan_path):
            try:
                # TODO (binyli): Each rank could generate its own execution plan separately. Doesn't need to generate whole plan.
                with open(tmp_path, "w") as f:
                    prog.post_process_operations()
                    f.write(prog.to_json(indent=None, separators=(",", ":"), ensure_ascii=False))
                    f.flush()
                    os.fsync(f.fileno())
                # Publish atomically; another rank may have won the race.
                if not os.path.exists(plan_path):
                    os.rename(tmp_path, plan_path)
                else:
                    os.remove(tmp_path)
            except Exception:
                # Bug fix: clean up our private temp file instead of unlinking
                # `plan_path`, which may be a valid plan just published by a
                # concurrent process. The old code also leaked `tmp_path`.
                Path(tmp_path).unlink(missing_ok=True)
        execution_plan = ExecutionPlan(plan_path, rank)
        return Algorithm(
            id=plan_id,
            execution_plan=execution_plan,
            constraint=Algorithm.Constraint(
                world_size=algo_spec.world_size, n_ranks_per_node=algo_spec.nranks_per_node
            ),
            tags=algo_spec.tags,
        )
class NativeCodeCompiler:
    """Compiler for native CUDA/HIP algorithm implementations.

    This compiler takes CUDA or HIP source files containing custom collective
    algorithm kernels and compiles them into loadable Python modules using
    pybind11 bindings.

    The compiler automatically detects whether to use NVCC (CUDA) or HIPCC (ROCm)
    based on the runtime environment. Compiled modules are cached to avoid
    recompilation.

    The cache location can be configured via the `MSCCLPP_NATIVE_CACHE_DIR`
    environment variable (defaults to `~/.cache/mscclpp/native`).

    Attributes:
        _is_hip: True if running on AMD/ROCm, False for NVIDIA/CUDA.
        _device_arch: The GPU architecture string (e.g., "sm_90" or "gfx90a").
        _compiler: Path to the compiler executable (nvcc or hipcc).

    Example:
        >>> compiler = NativeCodeCompiler()
        >>> module = compiler.compile("my_kernel", "path/to/kernel.cu")
        >>> algo = module.create_algorithm()
    """

    def __init__(self):
        self._is_hip = cp.cuda.runtime.is_hip
        self._device_arch = get_device_arch()
        self._compiler = self._get_compiler()
        self._default_options = ["-std=c++17", "-O3", "--shared"]
        # Python / pybind11 headers and the interpreter library.
        python_include = sysconfig.get_path("include")
        pybind11_include = pybind11.get_include()
        self._default_options += [f"-I{python_include}", f"-I{pybind11_include}"]
        python_lib = f"-lpython{sys.version_info.major}.{sys.version_info.minor}"
        self._default_options.append(python_lib)
        self._lib_home = os.path.abspath(os.path.dirname(__file__))
        if not self._is_hip:
            # Format: -gencode=arch=compute_90,code=sm_90
            compute_arch = self._device_arch.replace("sm_", "compute_")
            arch_flag = f"-gencode=arch={compute_arch},code={self._device_arch}"
            self._default_options.append(arch_flag)
            self._default_options += ["--compiler-options", "-fPIC"]
            self._default_options += ["--linker-options", f"-rpath,{self._lib_home}/lib"]
        else:
            # Format for HIP: --offload-arch=gfx90a
            arch_flag = f"--offload-arch={self._device_arch}"
            self._default_options.append(arch_flag)
            self._default_options += ["-fPIC"]
            self._default_options += ["-D__HIP_PLATFORM_AMD__"]
            self._default_options += [f"-Wl,-rpath,{self._lib_home}/lib"]
        # MSCCL++ headers and library shipped alongside this package.
        self._default_options = self._default_options + [
            "-I" + os.path.join(self._lib_home, "include"),
            "-L" + os.path.join(self._lib_home, "lib"),
            "-lmscclpp",
        ]
        cache_root = os.environ.get("MSCCLPP_NATIVE_CACHE_DIR", Path.home() / ".cache/mscclpp/native")
        self._cache_dir = Path(cache_root)
        self._cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_compiler(self) -> str:
        """Get the path to the appropriate compiler.

        Returns:
            Path to nvcc (CUDA) or hipcc (ROCm) compiler.
        """
        if self._is_hip:
            rocm_home = os.environ.get("ROCM_HOME")
            return os.path.join(rocm_home, "bin/hipcc") if rocm_home else "hipcc"
        else:
            cuda_home = os.environ.get("CUDA_HOME")
            return os.path.join(cuda_home, "bin/nvcc") if cuda_home else "nvcc"

    def get_arch(self):
        """Get the target GPU architecture.

        Returns:
            str: The GPU architecture string (e.g., "sm_90" for NVIDIA or "gfx90a" for AMD).
        """
        return self._device_arch

    def __call__(self, name: str, file: str):
        # Bug fix: the old signature accepted **kwds and forwarded them to
        # compile(), which takes no extra keyword arguments — any call with
        # kwds was a guaranteed TypeError. Drop the dead **kwds.
        return self.compile(name, file)

    def compile(self, name: str, file: str):
        """Compile a native CUDA/HIP source file into a Python module.

        Compiles a CUDA (.cu) or HIP source file containing custom collective
        algorithm kernels into a dynamically loadable Python module. The module
        is expected to use pybind11 bindings to expose algorithm creation
        functions.

        Compilation results are cached based on a hash of the source code,
        compiler options, and GPU architecture. Subsequent calls with unchanged
        inputs will return the cached module.

        Args:
            name: The name of the Python module to create (used for importing).
            file: Path to the CUDA/HIP source file to compile.

        Returns:
            module: The compiled and loaded Python module containing the
                algorithm implementation.

        Raises:
            FileNotFoundError: If the specified source file does not exist.
            RuntimeError: If compilation fails (compiler not found, syntax errors, etc.).
            ImportError: If the compiled module cannot be loaded.

        Note:
            - MSCCLPP headers are automatically included in the compilation.
            - The module is cached in `MSCCLPP_NATIVE_CACHE_DIR` (default: ~/.cache/mscclpp/native).
            - File locking prevents race conditions during parallel compilation.
        """
        if not os.path.isfile(file):
            raise FileNotFoundError(f"The specified source file does not exist: {file}")
        with open(file, "rb") as source_file:
            source_bytes = source_file.read()
        source_hash = blake3(source_bytes).hexdigest()
        cache_key = blake3(
            _stable_json_bytes(
                {
                    "version": __version__,
                    "source_hash": source_hash,
                    "compiler": self._compiler,
                    "options": self._default_options,
                    "arch": self._device_arch,
                }
            )
        ).hexdigest()
        output_file = self._cache_dir / f"{name}-{cache_key}.so"
        lock_file = output_file.with_suffix(output_file.suffix + ".lock")
        # Exclusive lock: only one process compiles; the flock is released
        # when the handle is closed at the end of the `with` block.
        with open(lock_file, "w") as lock_handle:
            fcntl.flock(lock_handle, fcntl.LOCK_EX)
            if not output_file.exists():
                tmp_file = output_file.with_suffix(output_file.suffix + f".tmp.{os.getpid()}")
                compile_command = [self._compiler] + self._default_options + ["-o", str(tmp_file), file]
                try:
                    subprocess.run(compile_command, check=True, capture_output=True, text=True)
                    os.replace(tmp_file, output_file)
                except FileNotFoundError as e:
                    Path(tmp_file).unlink(missing_ok=True)
                    raise RuntimeError(
                        f"Compiler '{self._compiler}' not found. Make sure it's installed and in PATH."
                    ) from e
                except subprocess.CalledProcessError as e:
                    Path(tmp_file).unlink(missing_ok=True)
                    raise RuntimeError(
                        f"Compilation failed with return code {e.returncode}.\n"
                        f"Command: {' '.join(compile_command)}\n"
                        f"Stdout: {e.stdout}\n"
                        f"Stderr: {e.stderr}"
                    ) from e
        # Reuse an already-loaded module for the same cache key.
        module_name = name
        existing_module = sys.modules.get(module_name)
        if existing_module and getattr(existing_module, "__mscclpp_cache_key__", None) == cache_key:
            return existing_module
        spec = importlib.util.spec_from_file_location(module_name, output_file)
        if spec is None or spec.loader is None:
            raise ImportError(f"Could not load module '{name}' from '{output_file}'")
        module = importlib.util.module_from_spec(spec)
        module.__mscclpp_cache_key__ = cache_key
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
        logging.debug(f"Successfully compiled and loaded module '{name}' from '{output_file}'")
        return module

View File

@@ -1,6 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from mscclpp.language.default_algos.allreduce_2nodes import allreduce_2nodes
from mscclpp.default_algos.allreduce_2nodes import allreduce_2nodes
__all__ = ["allreduce_2nodes"]

View File

@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from .algorithm_collection_builder import *
__all__ = algorithm_collection_builder.__all__

View File

@@ -0,0 +1,60 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import annotations
from typing import Union
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection
import atexit
from mscclpp._mscclpp import (
AlgorithmCollectionBuilder as _AlgorithmCollectionBuilder,
)
__all__ = ["AlgorithmCollectionBuilder"]
class AlgorithmCollectionBuilder:
    """Python-side singleton facade over the native algorithm-collection builder."""

    _instance = None

    def __new__(cls):
        # Lazily create the one shared instance.
        if cls._instance is None:
            cls._instance = super(AlgorithmCollectionBuilder, cls).__new__(cls)
        return cls._instance

    @classmethod
    def reset(cls):
        """Reset the native builder and drop the Python singleton."""
        if cls._instance is not None:
            _AlgorithmCollectionBuilder.reset()
            cls._instance = None

    def __init__(self):
        # Initialize only once per singleton lifetime.
        if not hasattr(self, "_initialized"):
            self._builder = _AlgorithmCollectionBuilder.get_instance()
            self._initialized = True

    def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]):
        """Register either a native AlgorithmBuilder or a DSL Algorithm."""
        if isinstance(algorithm_builder, AlgorithmBuilder):
            self._builder.add_algorithm_builder(algorithm_builder._algorithm_builder)
        elif isinstance(algorithm_builder, Algorithm) and algorithm_builder.is_dsl_algorithm():
            self._builder.add_dsl_algorithm_builder(algorithm_builder._algorithm)
        else:
            raise ValueError("The 'algorithm_builder' argument must be an instance of AlgorithmBuilder or DSL Algorithm.")

    def set_algorithm_selector(self, selector):
        """Install the primary algorithm selector callback."""
        self._builder.set_algorithm_selector(selector)

    def set_fallback_algorithm_selector(self, selector):
        """Install the fallback algorithm selector callback."""
        self._builder.set_fallback_algorithm_selector(selector)

    def build(self) -> AlgorithmCollection:
        """Build and wrap the configured algorithm collection."""
        collection = self._builder.build()
        return AlgorithmCollection(collection)

    def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection:
        """Build the default algorithm set for this rank using the given scratch buffer."""
        native_collection = self._builder.build_default_algorithms(int(scratch_buffer), scratch_buffer_size, rank)
        return AlgorithmCollection(native_collection)
# Release the native builder singleton when the interpreter exits.
atexit.register(AlgorithmCollectionBuilder.reset)

View File

@@ -2,3 +2,5 @@
# Licensed under the MIT License.
"""MSCCL++ DSL."""
from .utils import *

View File

@@ -5,6 +5,8 @@ from enum import Enum
from dataclasses import dataclass, field
from mscclpp.language.collectives import Collective
__all__ = ["AlgoSpec", "ReplicationPolicy"]
class ReplicationPolicy(Enum):
interleaved = "interleaved"

View File

@@ -6,11 +6,12 @@ import os
import struct
import subprocess
import tempfile
from typing import Any, Type, Union, Tuple
from typing import Any, Type, Union
import cupy as cp
import numpy as np
from ._mscclpp import RawGpuBuffer
from mscclpp._mscclpp import DataType
try:
import torch
@@ -22,6 +23,22 @@ except ImportError:
torchTensor = Type[Any]
__all__ = [
"Kernel",
"KernelBuilder",
"pack",
"get_device_arch",
"torch_dtype_to_mscclpp_dtype",
]
def get_device_arch() -> str:
    """Return the current GPU's architecture string.

    On NVIDIA this is "sm_<cc>" (e.g. "sm_90"); on ROCm it is the raw
    gcnArchName reported by the runtime.
    """
    if not cp.cuda.runtime.is_hip:
        return f"sm_{cp.cuda.Device().compute_capability}"
    props = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)
    return props["gcnArchName"].decode("utf-8")
class Kernel:
CU_LAUNCH_PARAM_BUFFER_POINTER = 0x01
CU_LAUNCH_PARAM_BUFFER_SIZE = 0x02
@@ -86,7 +103,8 @@ class KernelBuilder:
mscclpp_home = os.environ.get("MSCCLPP_HOME", "/usr/local/mscclpp")
include_dir = os.path.join(mscclpp_home, "include")
if not cp.cuda.runtime.is_hip:
compute_capability = cp.cuda.Device().compute_capability
arch = get_device_arch()
compute_capability = arch.replace("sm_", "")
cuda_home = os.environ.get("CUDA_HOME")
nvcc = os.path.join(cuda_home, "bin/nvcc") if cuda_home else "nvcc"
command = [
@@ -104,9 +122,7 @@ class KernelBuilder:
]
else:
# the gcn arch name is like "gfx942:sramecc+:xnack-"
gcn_arch = (
cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)["gcnArchName"].decode("utf-8").split(":")[0]
)
gcn_arch = get_device_arch()
rocm_home = os.environ.get("ROCM_HOME")
hipcc = os.path.join(rocm_home, "bin/hipcc") if rocm_home else "hipcc"
command = [
@@ -138,25 +154,6 @@ class KernelBuilder:
self._tempdir.cleanup()
class GpuBuffer(cp.ndarray):
def __new__(
cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
):
# Check if `shape` is valid
if isinstance(shape, int):
shape = (shape,)
try:
shape = tuple(shape)
except TypeError:
raise ValueError("Shape must be a tuple-like or an integer.")
if any(s <= 0 for s in shape):
raise ValueError("Shape must be positive.")
# Create the buffer
buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)
def pack(*args):
res = b""
for arg in list(args):
@@ -182,3 +179,18 @@ def pack(*args):
def is_torch_tensor(tensor: Any) -> bool:
    # True only when torch imported successfully and `tensor` is a torch.Tensor.
    return _use_torch and isinstance(tensor, torchTensor)
def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType:
    """Map a torch dtype to the corresponding mscclpp DataType.

    Raises:
        RuntimeError: if PyTorch is not available.
        ValueError: if the dtype has no mscclpp equivalent.
    """
    if not _use_torch:
        raise RuntimeError("PyTorch is not available.")
    dtype_map = {
        torch.float16: DataType.float16,
        torch.float32: DataType.float32,
        torch.int32: DataType.int32,
        torch.bfloat16: DataType.bfloat16,
    }
    if dtype not in dtype_map:
        raise ValueError(f"Unknown data type: {dtype}")
    return dtype_map[dtype]

View File

@@ -1 +1,4 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from .mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5

View File

@@ -13,9 +13,7 @@ from mscclpp_op import (
from nccl_op import NcclAllReduce
from mpi4py import MPI
import cupy.cuda.nccl as nccl
import mscclpp.comm as mscclpp_comm
from mscclpp import ProxyService, is_nvls_supported
from mscclpp.utils import GpuBuffer
from mscclpp import ProxyService, is_nvls_supported, CommGroup, GpuBuffer
from prettytable import PrettyTable
import netifaces as ni
import ipaddress
@@ -160,9 +158,7 @@ def find_best_config(mscclpp_call, niter):
return best_config, best_time
def run_benchmark(
mscclpp_group: mscclpp_comm.CommGroup, nccl_op: nccl.NcclCommunicator, table: PrettyTable, niter: int, nelem: int
):
def run_benchmark(mscclpp_group: CommGroup, nccl_op: nccl.NcclCommunicator, table: PrettyTable, niter: int, nelem: int):
memory = GpuBuffer(nelem, dtype=data_type)
memory_out = GpuBuffer(nelem, dtype=data_type)
cp.cuda.runtime.deviceSynchronize()
@@ -259,9 +255,7 @@ if __name__ == "__main__":
network_interface, my_ip = get_netinterface_info()
root_ip = MPI.COMM_WORLD.bcast(my_ip, root=0)
ifIpPortTrio = network_interface + ":" + root_ip + ":50000" # some random port
mscclpp_group = mscclpp_comm.CommGroup(
interfaceIpPortTrio=ifIpPortTrio, rank=MPI.COMM_WORLD.rank, size=MPI.COMM_WORLD.size
)
mscclpp_group = CommGroup(interfaceIpPortTrio=ifIpPortTrio, rank=MPI.COMM_WORLD.rank, size=MPI.COMM_WORLD.size)
# create a NcclComm
if MPI.COMM_WORLD.rank == 0:

View File

@@ -1,9 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import cupy as cp
import ctypes
from mscclpp import Transport, ProxyService, MemoryDevice2DeviceSemaphore
import mscclpp.comm as mscclpp_comm
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
from mscclpp import CommGroup, GpuBuffer
from mscclpp.utils import KernelBuilder, pack
IB_TRANSPORTS = [
Transport.IB0,
@@ -31,7 +34,7 @@ def type_to_str(dtype):
class MscclppAllReduce1:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
memory: cp.ndarray,
read_only: int = 1,
block_size: int = 1024,
@@ -97,7 +100,7 @@ class MscclppAllReduce1:
class MscclppAllReduce2:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
memory: cp.ndarray,
memory_out: cp.ndarray,
block_size: int = 512,
@@ -164,7 +167,7 @@ class MscclppAllReduce2:
class MscclppAllReduce3:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
memory: cp.ndarray,
proxy_service: ProxyService,
block_size: int = 1024,
@@ -234,7 +237,7 @@ class MscclppAllReduce3:
class MscclppAllReduce4:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
memory: cp.ndarray,
nranks_per_node: int,
proxy_service: ProxyService,
@@ -335,7 +338,7 @@ class MscclppAllReduce4:
class MscclppAllReduce5:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
memory: cp.ndarray,
memory_out: cp.ndarray,
nranks_per_node: int,
@@ -428,7 +431,7 @@ class MscclppAllReduce5:
class MscclppAllReduce6:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
nelem: int,
memory_dtype: cp.dtype,
block_size: int = 1024,

View File

@@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import cupy.cuda.nccl as nccl
from mpi4py import MPI
import cupy as cp

View File

@@ -6,4 +6,5 @@ pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
blake3
blake3
pybind11

View File

@@ -6,4 +6,5 @@ pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
blake3
blake3
pybind11

View File

@@ -10,7 +10,7 @@ from mscclpp import (
npkit,
env,
)
import mscclpp.comm as mscclpp_comm
from mscclpp import CommGroup, GpuBuffer
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
import os
import struct
@@ -180,7 +180,7 @@ def main(
n_iters: int = 10,
n_graph_iters: int = 10,
):
mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
mscclpp_group = CommGroup(MPI.COMM_WORLD)
cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
executor = Executor(mscclpp_group.communicator)
npkit_dump_dir = env().npkit_dump_dir

View File

@@ -13,7 +13,6 @@ import pytest
from mscclpp import (
ErrorCode,
Error,
DataType,
EndpointConfig,
ExecutionPlan,
@@ -31,8 +30,8 @@ from mscclpp import (
Device,
DeviceType,
)
import mscclpp.comm as mscclpp_comm
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
from mscclpp import CommGroup, GpuBuffer
from mscclpp.utils import KernelBuilder, pack
from ._cpp import _ext
from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
@@ -75,7 +74,7 @@ def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str):
# ranks are on different nodes
pytest.skip("this case is not supported as localhost will be different for different nodes")
group = mscclpp_comm.CommGroup(mpi_group.comm, ifIpPortTrio)
group = CommGroup(mpi_group.comm, ifIpPortTrio)
nelem = 1024
memory = np.zeros(nelem, dtype=np.int32)
@@ -141,7 +140,7 @@ def test_bootstrap_init_gil_release(mpi_group: MpiGroup):
mpi_group.comm.barrier()
def create_connection(group: mscclpp_comm.CommGroup, connection_type: str):
def create_connection(group: CommGroup, connection_type: str):
if connection_type == "NVLS":
all_ranks = list(range(group.nranks))
tran = Transport.CudaIpc
@@ -163,7 +162,7 @@ def create_connection(group: mscclpp_comm.CommGroup, connection_type: str):
def create_group_and_connection(mpi_group: MpiGroup, connection_type: str):
if (connection_type == "NVLink" or connection_type == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False:
pytest.skip("cannot use nvlink/nvls for cross node")
group = mscclpp_comm.CommGroup(mpi_group.comm)
group = CommGroup(mpi_group.comm)
try:
connection = create_connection(group, connection_type)
except Error as e:
@@ -282,7 +281,7 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, connection_type: str,
@parametrize_mpi_groups(2, 4, 8, 16)
def test_h2h_semaphores(mpi_group: MpiGroup):
group = mscclpp_comm.CommGroup(mpi_group.comm)
group = CommGroup(mpi_group.comm)
tran = group.my_ib_device(group.my_rank % 8)
endpoint = EndpointConfig(tran, Device(DeviceType.CPU))
remote_nghrs = list(range(group.nranks))
@@ -302,7 +301,7 @@ def test_h2h_semaphores(mpi_group: MpiGroup):
@parametrize_mpi_groups(2, 4, 8, 16)
def test_h2h_semaphores_gil_release(mpi_group: MpiGroup):
group = mscclpp_comm.CommGroup(mpi_group.comm)
group = CommGroup(mpi_group.comm)
tran = group.my_ib_device(group.my_rank % 8)
endpoint = EndpointConfig(tran, Device(DeviceType.CPU))
remote_nghrs = list(range(group.nranks))
@@ -339,7 +338,7 @@ def test_h2h_semaphores_gil_release(mpi_group: MpiGroup):
def test_nvls_connection(mpi_group: MpiGroup):
if all_ranks_on_the_same_node(mpi_group) is False:
pytest.skip("cannot use nvls for cross node")
group = mscclpp_comm.CommGroup(mpi_group.comm)
group = CommGroup(mpi_group.comm)
all_ranks = list(range(group.nranks))
nvls_connection = group.make_connection(all_ranks, Transport.CudaIpc, use_switch=True)
memory1 = GpuBuffer(2**29, cp.int8)
@@ -659,7 +658,7 @@ def test_executor(mpi_group: MpiGroup, filename: str):
if all_ranks_on_the_same_node(mpi_group) is False:
pytest.skip("algo not support cross node")
project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm)
mscclpp_group = CommGroup(mpi_group.comm)
executor = Executor(mscclpp_group.communicator)
npkit_dump_dir = env().npkit_dump_dir
if npkit_dump_dir != "":

View File

@@ -1,6 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cpp *.cu)
target_sources(mscclpp_obj PRIVATE ${SOURCES})
target_include_directories(mscclpp_obj PRIVATE include)
add_subdirectory(core)
add_subdirectory(ext)

View File

@@ -1,108 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mscclpp/algorithm.hpp>
namespace mscclpp {
// Pimpl body for Algorithm: bundles the user-supplied callbacks together with
// the lazily-created contexts, cached by AlgorithmCtxKey.
class Algorithm::Impl {
 public:
  Impl(std::string name, std::string collective, Algorithm::InitFunc initFunc, Algorithm::KernelFunc kernelFunc,
       Algorithm::ContextInitFunc contextInitFunc, Algorithm::ContextKeyGenFunc contextKeyGenFunc)
      : name_(name),
        collective_(collective),
        initFunc_(initFunc),
        kernelLaunchFunc_(kernelFunc),
        contextInitFunc_(contextInitFunc),
        contextKeyGenFunc_(contextKeyGenFunc) {}

  // Runs the algorithm (lazy init + context lookup + kernel launch); defined below.
  int launch(std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count, DataType dtype,
             cudaStream_t stream, std::unordered_map<std::string, std::shared_ptr<void>>& extras);

  std::string name_;        // human-readable algorithm name
  std::string collective_;  // collective this algorithm implements
  Algorithm::InitFunc initFunc_;
  Algorithm::KernelFunc kernelLaunchFunc_;
  Algorithm::ContextInitFunc contextInitFunc_;
  Algorithm::ContextKeyGenFunc contextKeyGenFunc_;
  bool initialized_ = false;  // set after initFunc_ has run once
  std::unordered_map<AlgorithmCtxKey, std::shared_ptr<AlgorithmCtx>> contexts_;  // context cache
};
// Lazily initializes the algorithm on first launch, creates (and caches) a
// context for the given buffers/count/dtype, then invokes the kernel callback.
// NOTE(review): initialized_ and contexts_ are mutated without synchronization
// — assumes callers serialize launches; confirm.
int Algorithm::Impl::launch(std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                            DataType dtype, cudaStream_t stream,
                            std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  if (!initialized_) {
    initFunc_(comm, extras);
    initialized_ = true;
  }
  // Contexts are cached so repeated launches with the same key skip setup.
  AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, count, dtype);
  auto it = contexts_.find(ctxKey);
  if (it == contexts_.end()) {
    auto ctx = contextInitFunc_(comm, input, output, count, dtype);
    contexts_[ctxKey] = ctx;
  }
  return kernelLaunchFunc_(contexts_[ctxKey], input, output, count, dtype, stream, extras);
}
Algorithm::Algorithm(std::string name, std::string collective, InitFunc initFunc, KernelFunc kernelFunc,
                     ContextInitFunc contextInitFunc, ContextKeyGenFunc contextKeyGenFunc)
    : impl_(std::make_shared<Impl>(name, collective, initFunc, kernelFunc, contextInitFunc, contextKeyGenFunc)) {}

// Forwards to Impl::launch; returns the kernel callback's status code.
int Algorithm::launch(std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      DataType dtype, cudaStream_t stream,
                      std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  return this->impl_->launch(comm, input, output, count, dtype, stream, extras);
}

// True for a default-constructed Algorithm with no implementation attached.
bool Algorithm::isEmpty() { return !impl_; }

// NOTE(review): name()/collective() dereference impl_ unconditionally —
// undefined behavior on an empty Algorithm; callers should check isEmpty().
std::string Algorithm::name() const { return impl_->name_; }

std::string Algorithm::collective() const { return impl_->collective_; }
// Registers (or overwrites) an algorithm under (collective, algoName).
void AlgorithmCollection::registerAlgorithm(const std::string collective, const std::string algoName,
                                            Algorithm algorithm) {
  this->algoMapByCollective_[collective][algoName] = algorithm;
}

// Picks an algorithm via the user selector, falling back to the fallback
// selector when the user selector is unset or returns an empty Algorithm.
// NOTE(review): fallbackAlgoSelector_ is invoked without a null check — an
// unset fallback raises std::bad_function_call here; confirm it is always set.
Algorithm AlgorithmCollection::selectAlgorithm(const std::string& collective, const void* input, void* output,
                                               size_t messageSize, DataType dtype, int nRanksPerNode, int worldSize) {
  Algorithm algo;
  if (algoSelector_) {
    algo = algoSelector_(algoMapByCollective_, collective, input, output, messageSize, dtype, nRanksPerNode, worldSize);
  }
  if (algo.isEmpty()) {
    algo = fallbackAlgoSelector_(algoMapByCollective_, collective, input, output, messageSize, dtype, nRanksPerNode,
                                 worldSize);
  }
  return algo;
}
// Process-wide singleton builder.
std::shared_ptr<AlgorithmCollectionBuilder> AlgorithmCollectionBuilder::getInstance() {
  static std::shared_ptr<AlgorithmCollectionBuilder> instance(new AlgorithmCollectionBuilder());
  return instance;
}

void AlgorithmCollectionBuilder::addAlgorithmBuilder(std::shared_ptr<AlgorithmBuilder> builder) {
  this->algoBuilders_.push_back(builder);
}

void AlgorithmCollectionBuilder::setAlgorithmSelector(AlgoSelectFunc selector) { algoSelector_ = selector; }

void AlgorithmCollectionBuilder::setFallbackAlgorithmSelector(AlgoSelectFunc selector) {
  fallbackAlgoSelector_ = selector;
}

// Builds a collection from all registered builders and installs the
// configured selectors on it.
std::shared_ptr<AlgorithmCollection> AlgorithmCollectionBuilder::build() {
  auto collection = std::make_shared<AlgorithmCollection>();
  for (const auto& builder : algoBuilders_) {
    auto algo = builder->build();
    collection->registerAlgorithm(algo.collective(), algo.name(), algo);
  }
  collection->algoSelector_ = algoSelector_;
  collection->fallbackAlgoSelector_ = fallbackAlgoSelector_;
  return collection;
}
} // namespace mscclpp

64
src/core/CMakeLists.txt Normal file
View File

@@ -0,0 +1,64 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# Collect every C/C++/CUDA source under src/core into the core object library.
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cpp *.cu)

# Handle ROCm specific source property
# (hipcc compiles .cu files as ordinary C++, so retag them as CXX.)
if(MSCCLPP_USE_ROCM)
    file(GLOB_RECURSE CU_SOURCES *.cu)
    set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX)
endif()

# Single object library compiled once and reused by both the shared and the
# static library targets below.
add_library(mscclpp_obj OBJECT ${SOURCES})
target_include_directories(mscclpp_obj
    SYSTEM PRIVATE
    ${GPU_INCLUDE_DIRS}
    ${NUMA_INCLUDE_DIRS})
target_include_directories(mscclpp_obj PRIVATE
    include
    ${PROJECT_SOURCE_DIR}/include
)
target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads dl)

# Optional InfiniBand support; USE_IBVERBS is PUBLIC so consumers of the
# object library see the same feature macro.
if(MSCCLPP_USE_IB)
    target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
    target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
    target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
endif()

set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
if(MSCCLPP_USE_CUDA)
    target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_CUDA)
elseif(MSCCLPP_USE_ROCM)
    target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_ROCM)
    foreach(arch ${MSCCLPP_GPU_ARCHS})
        target_compile_options(mscclpp_obj PRIVATE --offload-arch=${arch})
    endforeach()
endif()
if(MSCCLPP_ENABLE_TRACE)
    target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_ENABLE_TRACE)
endif()
if(MSCCLPP_NPKIT_FLAGS)
    target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()

# libmscclpp
# Shared and static flavors both wrap the same object library.
add_library(mscclpp SHARED)
target_link_libraries(mscclpp PUBLIC mscclpp_obj)
set_target_properties(mscclpp PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
add_library(mscclpp_static STATIC)
target_link_libraries(mscclpp_static PUBLIC mscclpp_obj)
set_target_properties(mscclpp_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})

install(TARGETS mscclpp
    LIBRARY DESTINATION ${INSTALL_PREFIX}/lib)
install(TARGETS mscclpp_static
    ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib)

201
src/core/algorithm.cc Normal file
View File

@@ -0,0 +1,201 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <filesystem>
#include <mscclpp/algorithm.hpp>
#include "logger.hpp"
namespace mscclpp {
// Classifies the request as in-place or out-of-place.
CollectiveBufferMode CollectiveRequest::bufferMode() const {
  // Directly aliasing buffers are always in-place.
  if (inputBuffer == outputBuffer) {
    return CollectiveBufferMode::InPlace;
  }
  // Allgather also counts as in-place when the input buffer is exactly this
  // rank's slot inside the output buffer.
  if (collective == "allgather") {
    const char* outputBase = static_cast<const char*>(outputBuffer);
    const void* rankSlot = outputBase + rank * messageSize;
    return rankSlot == inputBuffer ? CollectiveBufferMode::InPlace : CollectiveBufferMode::OutOfPlace;
  }
  return CollectiveBufferMode::OutOfPlace;
}
// Binds the user-supplied callbacks and metadata for a natively-implemented
// collective algorithm. Everything is captured by value; no GPU work happens
// until execute() is called.
NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitFunc initFunc, KernelFunc kernelFunc,
                                 ContextInitFunc contextInitFunc, ContextKeyGenFunc contextKeyGenFunc,
                                 size_t minMessageSize, size_t maxMessageSize, CollectiveBufferMode bufferMode,
                                 std::unordered_map<std::string, uint64_t> tags, Constraint constraint)
    : name_(name),
      collective_(collective),
      initFunc_(initFunc),
      kernelLaunchFunc_(kernelFunc),
      contextInitFunc_(contextInitFunc),
      contextKeyGenFunc_(contextKeyGenFunc),
      minMessageSize_(minMessageSize),
      maxMessageSize_(maxMessageSize),
      bufferMode_(bufferMode),
      tags_(tags),
      constraint_(constraint) {}
// Runs the native algorithm: initializes once (lazily), creates and caches a
// context per (input, output, sizes, dtype) key, then launches the kernel.
// The Executor parameter is unused by native algorithms.
// NOTE(review): initialized_ and contexts_ are mutated without locking —
// assumes launches are serialized by the caller; confirm.
CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
                                    size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
                                    cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
                                    const std::unordered_map<std::string, uintptr_t>& extras) {
  if (!initialized_) {
    initFunc_(comm);
    initialized_ = true;
  }
  // Reuse a cached context when one matching this key already exists.
  AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype);
  auto it = contexts_.find(ctxKey);
  if (it == contexts_.end()) {
    auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype);
    contexts_[ctxKey] = ctx;
  }
  return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks,
                           nThreadsPerBlock, extras);
}
const std::string& NativeAlgorithm::name() const { return name_; }

const std::string& NativeAlgorithm::collective() const { return collective_; }

// Returns the [min, max] message-size range this algorithm supports.
// Bug fix: the previous function-local `static` pair was shared by every
// instance across every thread, so concurrent callers (or interleaved calls
// on different instances) could observe another algorithm's range.
// `thread_local` removes the data race; the pair is still refreshed on every
// call, so the returned reference is only valid until the same thread calls
// messageRange() again (on any instance).
const std::pair<size_t, size_t>& NativeAlgorithm::messageRange() const {
  static thread_local std::pair<size_t, size_t> range;
  range = {minMessageSize_, maxMessageSize_};
  return range;
}
const std::unordered_map<std::string, uint64_t>& NativeAlgorithm::tags() const { return tags_; }

const CollectiveBufferMode& NativeAlgorithm::bufferMode() const { return bufferMode_; }

Algorithm::Constraint NativeAlgorithm::constraint() const { return constraint_; }

// Drops all cached contexts and forces re-initialization on the next execute().
void NativeAlgorithm::reset() {
  contexts_.clear();
  initialized_ = false;
}
void AlgorithmCollection::registerAlgorithm(const std::string collective, const std::string algoName,
std::shared_ptr<Algorithm> algorithm) {
this->algoMapByCollective_[collective][algoName] = algorithm;
}
// Picks an algorithm for the request: the primary selector is consulted
// first, then the fallback selector if the primary is unset or returned
// nothing. Returns nullptr when no selector produced a match.
// Bug fix: the fallback selector was previously invoked without a null check
// whenever the primary yielded no algorithm — with only a primary selector
// installed, that raised std::bad_function_call instead of returning nullptr.
std::shared_ptr<Algorithm> AlgorithmCollection::selectAlgorithm(const CollectiveRequest& request) {
  std::shared_ptr<Algorithm> algo;
  if (!algoSelector_ && !fallbackAlgoSelector_) {
    THROW(ALGO, Error, ErrorCode::InvalidUsage, "No algorithm selector is set in AlgorithmCollection.");
  }
  if (algoSelector_) {
    algo = algoSelector_(algoMapByCollective_, request);
  }
  if (!algo && fallbackAlgoSelector_) {
    algo = fallbackAlgoSelector_(algoMapByCollective_, request);
  }
  return algo;
}
void AlgorithmCollection::extend(const AlgorithmCollection& other) {
for (const auto& [collective, algoMap] : other.algoMapByCollective_) {
for (const auto& [algoName, algorithm] : algoMap) {
this->registerAlgorithm(collective, algoName, algorithm);
}
}
}
void AlgorithmCollection::setSelectors(AlgoSelectFunc algoSelector, AlgoSelectFunc fallbackAlgoSelector) {
algoSelector_ = algoSelector;
fallbackAlgoSelector_ = fallbackAlgoSelector;
}
std::vector<std::shared_ptr<Algorithm>> AlgorithmCollection::getAllAlgorithms() const {
std::vector<std::shared_ptr<Algorithm>> allAlgos;
for (const auto& [collective, algoMap] : algoMapByCollective_) {
for (const auto& [algoName, algorithm] : algoMap) {
allAlgos.push_back(algorithm);
}
}
return allAlgos;
}
std::unordered_map<std::string, std::shared_ptr<Algorithm>> AlgorithmCollection::getAlgorithmsByCollective(
const std::string& collective) const {
auto it = algoMapByCollective_.find(collective);
if (it != algoMapByCollective_.end()) {
return it->second;
} else {
return {};
}
}
// Wraps a DSL execution plan as an Algorithm.
DslAlgorithm::DslAlgorithm(std::string id, ExecutionPlan plan, std::unordered_map<std::string, uint64_t> tags,
                           Constraint constraint)
    : plan_(plan), id_(id), tags_(tags), constraint_(constraint) {}

const std::string& DslAlgorithm::name() const { return plan_.name(); }

const std::string& DslAlgorithm::collective() const { return plan_.collective(); }

// Returns the [min, max] message-size range declared by the execution plan.
// Bug fix: the previous function-local `static` pair was shared across all
// instances and threads, letting one algorithm's range clobber another's and
// racing between threads. `thread_local` removes the data race; the returned
// reference stays valid only until the same thread calls messageRange() again.
const std::pair<size_t, size_t>& DslAlgorithm::messageRange() const {
  static thread_local std::pair<size_t, size_t> range;
  range = {plan_.minMessageSize(), plan_.maxMessageSize()};
  return range;
}
const std::unordered_map<std::string, uint64_t>& DslAlgorithm::tags() const { return tags_; }

// Returns whether the wrapped plan operates in-place or out-of-place.
// Bug fix: the previous function-local `static` was initialized exactly once,
// so every DslAlgorithm instance after the first reported the FIRST
// instance's buffer mode (the former "TODO: need to fix"). Recompute on every
// call instead; thread_local storage also keeps the returned reference free
// of cross-thread races.
const CollectiveBufferMode& DslAlgorithm::bufferMode() const {
  static thread_local CollectiveBufferMode mode;
  mode = plan_.isInPlace() ? CollectiveBufferMode::InPlace : CollectiveBufferMode::OutOfPlace;
  return mode;
}

Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }
// Executes the DSL plan via the provided Executor, dispatching on dtype to
// the matching executor instantiation. ReduceOp, block/thread counts and
// extras are ignored — the plan itself encodes the schedule.
// NOTE(review): INT32 is forwarded to the executor tagged as UINT32 —
// bitwise identical for sum, but NOT for signed min/max reductions; confirm
// the executor's handling before relying on int32 min/max.
CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
                                 size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
                                 std::shared_ptr<Executor> executor, int, int,
                                 const std::unordered_map<std::string, uintptr_t>&) {
  if (!executor) {
    THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
  }
  int rank = comm->bootstrap()->getRank();
  switch (dtype) {
    case DataType::FLOAT16:
      executor->execute(rank, (half*)input, (half*)output, inputSize, outputSize, DataType::FLOAT16, plan_, stream);
      break;
    case DataType::FLOAT32:
      executor->execute(rank, (float*)input, (float*)output, inputSize, outputSize, DataType::FLOAT32, plan_, stream);
      break;
    case DataType::BFLOAT16:
      executor->execute(rank, (__bfloat16*)input, (__bfloat16*)output, inputSize, outputSize, DataType::BFLOAT16, plan_,
                        stream);
      break;
#if defined(__FP8_TYPES_EXIST__)
    // FP8 support is only compiled in when the toolkit provides the types.
    case DataType::FP8_E4M3:
      executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FP8_E4M3, plan_,
                        stream);
      break;
    case DataType::FP8_E5M2:
      executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FP8_E5M2, plan_,
                        stream);
      break;
#endif
    case DataType::INT32:
    case DataType::UINT32:
      executor->execute(rank, (int*)input, (int*)output, inputSize, outputSize, DataType::UINT32, plan_, stream);
      break;
    default:
      // Unsupported dtype: report an argument error instead of throwing.
      WARN(ALGO, "Unsupported data type: ", static_cast<int>(dtype), " in DslAlgorithm");
      return CommResult::CommInvalidArgument;
  }
  return CommResult::CommSuccess;
}
// A DslAlgorithm is its own builder: hand out a shared handle to this object
// (requires the instance to already be owned by a shared_ptr).
std::shared_ptr<Algorithm> DslAlgorithm::build() { return shared_from_this(); }

// TODO: implement this
// Currently a no-op; DSL algorithms do not cache native state here yet.
void DslAlgorithm::reset() {}
} // namespace mscclpp

View File

@@ -67,6 +67,10 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
);
#endif
break;
case DataType::FP8_E4M3:
case DataType::FP8_E5M2:
// FP8 is not supported in CUDA execution kernel.
break;
}
}

View File

@@ -15,20 +15,6 @@
namespace {
static const std::vector<mscclpp::AlgoConfig> defaultAlgoConfigs = {
{"allreduce_2nodes_1K_64K.json", "allreduce", 8, 16, {{"default", 1}}},
{"allreduce_2nodes_128K_2M.json", "allreduce", 8, 16, {{"default", 1}}}};
std::string simpleHash(const std::string& input) {
std::hash<std::string> hasher;
size_t hashValue = hasher(input);
std::ostringstream oss;
oss << std::hex << hashValue;
return oss.str();
}
std::string generateFileId(const std::string& filePath) { return simpleHash(filePath); }
template <typename T, typename Predicate>
std::vector<T> filter(const std::vector<T>& vec, Predicate pred) {
std::vector<T> filtered;
@@ -699,9 +685,9 @@ void ExecutionPlan::Impl::operationsReset() { this->operations.clear(); }
ExecutionPlan::ExecutionPlan(const std::string& planPath, int rank) : impl_(std::make_shared<Impl>(planPath, rank)) {}
std::string ExecutionPlan::name() const { return this->impl_->name; }
const std::string& ExecutionPlan::name() const { return this->impl_->name; }
std::string ExecutionPlan::collective() const { return this->impl_->collective; }
const std::string& ExecutionPlan::collective() const { return this->impl_->collective; }
size_t ExecutionPlan::minMessageSize() const { return this->impl_->minMessageSize; }
@@ -709,149 +695,4 @@ size_t ExecutionPlan::maxMessageSize() const { return this->impl_->maxMessageSiz
bool ExecutionPlan::isInPlace() const { return this->impl_->isInPlace; }
void ExecutionPlanRegistry::Impl::setSelector(ExecutionPlanSelector selector) { selector_ = selector; }
void ExecutionPlanRegistry::Impl::setDefaultSelector(ExecutionPlanSelector selector) { defaultSelector_ = selector; }
std::shared_ptr<ExecutionPlanHandle> ExecutionPlanRegistry::Impl::select(const ExecutionRequest& request) {
std::vector<std::shared_ptr<ExecutionPlanHandle>> plans;
for (auto plan : planMap_[request.collective]) {
if (plan->match(request)) {
plans.push_back(plan);
}
}
if (selector_) {
auto plan = selector_(plans, request);
if (plan) {
return plan;
}
}
if (defaultSelector_) {
auto plan = defaultSelector_(plans, request);
if (plan) {
return plan;
}
}
INFO(MSCCLPP_EXECUTOR, "No suitable execution plan found for collective: %s", request.collective.c_str());
return nullptr;
}
void ExecutionPlanRegistry::Impl::registerPlan(const std::shared_ptr<ExecutionPlanHandle> planHandle) {
if (!planHandle) {
throw Error("Cannot register a null plan", ErrorCode::ExecutorError);
}
planMap_[planHandle->plan->collective()].push_back(planHandle);
idMap_[planHandle->id] = planHandle;
}
void ExecutionPlanRegistry::Impl::loadDefaultPlans(int rank) {
std::string planDir = mscclpp::env()->executionPlanDir;
if (!std::filesystem::exists(planDir)) {
INFO(MSCCLPP_EXECUTOR, "Plan directory does not exist: %s", planDir.c_str());
return;
}
for (const auto& config : defaultAlgoConfigs) {
std::string planPath = planDir + "/" + config.filename;
INFO(MSCCLPP_EXECUTOR, "Loading plan: %s", planPath.c_str());
if (!std::filesystem::exists(planPath)) {
INFO(MSCCLPP_EXECUTOR, "Plan file does not exist: %s", planPath.c_str());
continue;
}
std::string planId = generateFileId(planPath);
if (idMap_.find(planId) != idMap_.end()) {
INFO(MSCCLPP_EXECUTOR, "Plan already registered: %s", planId.c_str());
continue;
}
try {
auto executionPlan = std::make_shared<ExecutionPlan>(planPath, rank);
auto handle =
ExecutionPlanHandle::create(planId, config.worldSize, config.nRanksPerNode, executionPlan, config.tags);
registerPlan(handle);
INFO(MSCCLPP_EXECUTOR, "Successfully loaded plan: %s for collective: %s", planId.c_str(),
config.collective.c_str());
} catch (const std::exception& e) {
WARN("Failed to load plan %s: %s", planPath.c_str(), e.what());
}
}
}
std::shared_ptr<ExecutionPlanRegistry> ExecutionPlanRegistry::getInstance() {
static std::shared_ptr<ExecutionPlanRegistry> instance(new ExecutionPlanRegistry);
return instance;
}
void ExecutionPlanRegistry::registerPlan(const std::shared_ptr<ExecutionPlanHandle> planHandle) {
impl_->registerPlan(planHandle);
}
void ExecutionPlanRegistry::setSelector(ExecutionPlanSelector selector) { impl_->setSelector(selector); }
void ExecutionPlanRegistry::setDefaultSelector(ExecutionPlanSelector selector) { impl_->setDefaultSelector(selector); }
std::shared_ptr<ExecutionPlanHandle> ExecutionPlanRegistry::select(
const std::string& collective, int worldSize, int nRanksPerNode, int rank, const void* sendBuffer, void* recvBuffer,
size_t messageSize, const std::unordered_map<std::string, std::vector<uint64_t>>& hints) {
ExecutionRequest request{worldSize, nRanksPerNode, rank, sendBuffer, recvBuffer, messageSize, collective, hints};
return impl_->select(request);
}
std::vector<std::shared_ptr<ExecutionPlanHandle>> ExecutionPlanRegistry::getPlans(const std::string& collective) {
if (impl_->planMap_.find(collective) != impl_->planMap_.end()) {
return impl_->planMap_[collective];
}
return {};
}
std::shared_ptr<ExecutionPlanHandle> ExecutionPlanRegistry::get(const std::string& id) {
if (impl_->idMap_.find(id) != impl_->idMap_.end()) {
return impl_->idMap_[id];
}
return nullptr;
}
ExecutionPlanRegistry::ExecutionPlanRegistry() : impl_(std::make_unique<Impl>()) {}
ExecutionPlanRegistry::~ExecutionPlanRegistry() = default;
void ExecutionPlanRegistry::clear() {
impl_->planMap_.clear();
impl_->idMap_.clear();
impl_->selector_ = nullptr;
impl_->defaultSelector_ = nullptr;
}
void ExecutionPlanRegistry::loadDefaultPlans(int rank) { impl_->loadDefaultPlans(rank); }
bool ExecutionRequest::isInPlace() const {
if (inputBuffer == outputBuffer) return true;
if (collective == "allgather") {
size_t rankOffset = rank * messageSize;
const char* expectedInput = static_cast<const char*>(outputBuffer) + rankOffset;
return static_cast<const void*>(expectedInput) == inputBuffer;
}
return false;
}
std::shared_ptr<ExecutionPlanHandle> ExecutionPlanHandle::create(
const std::string& id, int worldSize, int nRanksPerNode, std::shared_ptr<ExecutionPlan> plan,
const std::unordered_map<std::string, uint64_t>& tags) {
std::shared_ptr<ExecutionPlanHandle> handle(new ExecutionPlanHandle{id, {worldSize, nRanksPerNode}, plan, tags});
return handle;
}
bool ExecutionPlanHandle::match(const ExecutionRequest& request) {
bool worldSizeMatch = constraint.worldSize == request.worldSize;
bool ranksPerNodeMatch = constraint.nRanksPerNode == request.nRanksPerNode;
bool collectiveMatch = plan->collective() == request.collective;
bool inPlaceMatch = plan->isInPlace() == request.isInPlace();
size_t effectiveSize =
(request.collective == "allgather") ? (request.messageSize * request.worldSize) : request.messageSize;
bool minSizeMatch = effectiveSize >= plan->minMessageSize();
bool maxSizeMatch = effectiveSize <= plan->maxMessageSize();
bool result = worldSizeMatch && ranksPerNodeMatch && collectiveMatch && inPlaceMatch && minSizeMatch && maxSizeMatch;
return result;
}
} // namespace mscclpp

View File

@@ -12,11 +12,11 @@ static inline bool isCudaTeardownError(cudaError_t err) {
return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice;
#else // !defined(MSCCLPP_USE_ROCM)
return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError ||
err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure;
err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure || err == cudaErrorDeviceUninitialized;
#endif // !defined(MSCCLPP_USE_ROCM)
}
static inline bool isCuTeardownError(CUresult r) {
[[maybe_unused]] static inline bool isCuTeardownError(CUresult r) {
return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED || r == CUDA_ERROR_LAUNCH_FAILED;
}

Some files were not shown because too many files have changed in this diff Show More