Torch integration (#692)

Reorganize the current native algorithm implementation and the DSL algorithm
implementation.
Provide a unified API for both DSL and native algorithms, and provide an
interface to tune the algorithms.
Provide an interface for PyTorch integration with both the native API and the DSL.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
This commit is contained in:
Binyang Li
2026-01-21 20:32:24 -08:00
committed by GitHub
parent 78ce9fac8d
commit a707273701
156 changed files with 6107 additions and 4076 deletions

View File

@@ -94,7 +94,7 @@ jobs:
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" \
sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -106,7 +106,7 @@ jobs:
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
-x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \
-x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \

View File

@@ -74,13 +74,13 @@ steps:
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -101,9 +101,9 @@ steps:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -124,15 +124,15 @@ steps:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -152,10 +152,10 @@ steps:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -177,7 +177,7 @@ steps:
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -200,7 +200,7 @@ steps:
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 -m pip install .; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
kill $CHILD_PID
@@ -223,9 +223,9 @@ steps:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
./build/test/perf/fifo_test"'
./build/bin/perf/fifo_test"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -130,7 +130,7 @@ steps:
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
@@ -147,7 +147,7 @@ steps:
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
@@ -164,7 +164,7 @@ steps:
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -198,11 +198,12 @@ steps:
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -218,11 +219,12 @@ steps:
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -238,11 +240,12 @@ steps:
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
@@ -259,10 +262,10 @@ steps:
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2

View File

@@ -71,7 +71,7 @@ steps:
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py::test_executor -x"'
kill $CHILD_PID

View File

@@ -87,8 +87,8 @@ steps:
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
@@ -116,7 +116,7 @@ steps:
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \

View File

@@ -72,8 +72,8 @@ steps:
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
./build/test/unit_tests"'
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -94,10 +94,10 @@ steps:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests"'
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -117,7 +117,7 @@ steps:
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID

View File

@@ -41,9 +41,9 @@ cd ..
For testing after successful build:
```bash
# To run all tests
mpirun -np 2 ./build/test/mp_unit_tests
mpirun -np 2 ./build/bin/mp_unit_tests
# To run tests excluding IB-related ones (when IB is not available)
mpirun -np 2 ./build/test/mp_unit_tests --gtest_filter=-*Ib*
mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib*
```
For building a Python package:

View File

@@ -36,32 +36,32 @@ jobs:
- name: Run mscclpp AllGather test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- name: Run mscclpp SendRecv test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- name: Run mscclpp AllReduce test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- name: Run mscclpp AllToAll test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- name: Check collective primitives performance
run: |

View File

@@ -37,14 +37,14 @@ jobs:
- name: UnitTests
run: |
./build/test/unit_tests
./build/bin/unit_tests
- name: MpUnitTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- name: PyTests
run: |

1
.gitignore vendored
View File

@@ -6,3 +6,4 @@ __pycache__
.*.swp
.idea/
*.so
_codeql_detected_source_root

View File

@@ -16,6 +16,11 @@ endif()
set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}")
# Set output directories for all targets
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
find_package(Git)
set(GIT_HASH "UNKNOWN")
if(Git_FOUND)
@@ -44,7 +49,8 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
option(MSCCLPP_ENABLE_TRACE "Enable tracing" OFF)
option(MSCCLPP_BUILD_TESTS "Build tests" ON)
option(MSCCLPP_BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(MSCCLPP_BUILD_APPS_NCCL "Build NCCL interfaces" ON)
option(MSCCLPP_BUILD_EXT_NCCL "Build NCCL interfaces" ON)
option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON)
option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF)
option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
@@ -164,54 +170,12 @@ include(FetchContent)
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
FetchContent_MakeAvailable(json)
add_library(mscclpp_obj OBJECT)
target_include_directories(mscclpp_obj
SYSTEM PRIVATE
${GPU_INCLUDE_DIRS}
${NUMA_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads dl)
if(MSCCLPP_USE_IB)
target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
endif()
set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
if(MSCCLPP_USE_CUDA)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_CUDA)
elseif(MSCCLPP_USE_ROCM)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_ROCM)
foreach(arch ${MSCCLPP_GPU_ARCHS})
target_compile_options(mscclpp_obj PRIVATE --offload-arch=${arch})
endforeach()
endif()
if(MSCCLPP_ENABLE_TRACE)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_ENABLE_TRACE)
endif()
if(MSCCLPP_NPKIT_FLAGS)
target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()
# libmscclpp
add_library(mscclpp SHARED)
target_link_libraries(mscclpp PUBLIC mscclpp_obj)
set_target_properties(mscclpp PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
add_library(mscclpp_static STATIC)
target_link_libraries(mscclpp_static PUBLIC mscclpp_obj)
set_target_properties(mscclpp_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
add_subdirectory(include)
add_subdirectory(src)
if("${INSTALL_PREFIX}" STREQUAL "")
set(INSTALL_PREFIX "./")
endif()
install(TARGETS mscclpp_obj
FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX}/include)
install(TARGETS mscclpp
LIBRARY DESTINATION ${INSTALL_PREFIX}/lib)
install(TARGETS mscclpp_static
ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib)
add_subdirectory(src)
add_subdirectory(include)
# Tests
if(MSCCLPP_BUILD_TESTS)
@@ -223,8 +187,3 @@ endif()
if(MSCCLPP_BUILD_PYTHON_BINDINGS)
add_subdirectory(python)
endif()
# NCCL interfaces
if(MSCCLPP_BUILD_APPS_NCCL)
add_subdirectory(apps/nccl)
endif()

View File

@@ -1,51 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Build the NCCL-compatible shim (mscclpp_nccl) plus an LD_AUDIT shim library.
# Gather shim sources, public headers, and audit-shim sources.
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/*.cu)
file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS include/*.h)
file(GLOB_RECURSE AUDIT_SHIM CONFIGURE_DEPENDS audit-shim/*)
# ROCm builds compile the .cu sources as C++ (hipcc consumes them as CXX).
if(MSCCLPP_USE_ROCM)
set_source_files_properties(${SOURCES} PROPERTIES LANGUAGE CXX)
endif()
# Object library shared by the shared and static mscclpp_nccl targets.
add_library(mscclpp_nccl_obj OBJECT)
target_sources(mscclpp_nccl_obj PRIVATE ${SOURCES})
target_sources(mscclpp_nccl_obj PUBLIC FILE_SET HEADERS FILES ${HEADERS})
target_include_directories(mscclpp_nccl_obj PRIVATE include ${PROJECT_SOURCE_DIR}/src/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
target_link_libraries(mscclpp_nccl_obj PRIVATE ${GPU_LIBRARIES} PUBLIC mscclpp_obj)
set_target_properties(mscclpp_nccl_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
# Per-backend compile definitions; ROCm additionally needs per-arch offload flags.
if(MSCCLPP_USE_CUDA)
target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_CUDA)
elseif(MSCCLPP_USE_ROCM)
target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_ROCM)
foreach(arch ${MSCCLPP_GPU_ARCHS})
target_compile_options(mscclpp_nccl_obj PRIVATE --offload-arch=${arch})
endforeach()
endif()
# Optional NPKit profiling instrumentation flags.
if(MSCCLPP_NPKIT_FLAGS)
target_compile_definitions(mscclpp_nccl_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()
# Shared and static NCCL-compatible libraries built from the object library.
add_library(mscclpp_nccl SHARED)
target_link_libraries(mscclpp_nccl PUBLIC mscclpp_obj mscclpp_nccl_obj)
set_target_properties(mscclpp_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
add_library(mscclpp_nccl_static STATIC)
target_link_libraries(mscclpp_nccl_static PUBLIC mscclpp_obj mscclpp_nccl_obj)
set_target_properties(mscclpp_nccl_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
# Audit shim: a small standalone shared library (for use via LD_AUDIT).
add_library(mscclpp_audit_nccl_obj OBJECT)
set_target_properties(mscclpp_audit_nccl_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
target_sources(mscclpp_audit_nccl_obj PRIVATE ${AUDIT_SHIM})
add_library(mscclpp_audit_nccl SHARED)
target_link_libraries(mscclpp_audit_nccl PUBLIC mscclpp_audit_nccl_obj)
set_target_properties(mscclpp_audit_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
# Install headers and the three produced libraries under ${INSTALL_PREFIX}.
install(TARGETS mscclpp_nccl_obj
FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX})
install(TARGETS mscclpp_nccl
LIBRARY DESTINATION ${INSTALL_PREFIX}/lib)
install(TARGETS mscclpp_nccl_static
ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib)
install(TARGETS mscclpp_audit_nccl
LIBRARY DESTINATION ${INSTALL_PREFIX}/lib)

View File

@@ -1,220 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mscclpp/nccl.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/env.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "allgather.hpp"
#include "datatype_conversion.hpp"
#include "debug.h"
// Constructor: channel caching is on by default; the mscclpp environment flag
// `disableChannelCache` turns it off, forcing channels to be rebuilt per call.
AllgatherAlgo6::AllgatherAlgo6() : disableChannelCache_(false) {
  if (mscclpp::env()->disableChannelCache) {
    disableChannelCache_ = true;
  }
}
// One-time setup: establish peer connections and pre-create the memory
// semaphores (nChannelsPerConnection_ per connection) reused by every context.
void AllgatherAlgo6::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
  this->conns_ = setupConnections(comm);
  this->memorySemaphores_ = std::move(setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_));
}
// Launch the allgather6 kernel. Chooses the grid size from the message size
// (empirical tuning table) and picks the in-place vs. out-of-place template
// variant by checking whether `input` aliases this rank's slot in `output`.
// Returns ncclInternalError if the kernel launch failed.
ncclResult_t AllgatherAlgo6::allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                                 void* output, size_t count, mscclpp::DataType dtype,
                                                 cudaStream_t stream,
                                                 std::unordered_map<std::string, std::shared_ptr<void>>&) {
  int nBlocks = 28;
  const size_t bytes = count * getDataTypeSize(dtype);
  // The kernel moves data in int-sized units; assumes `bytes` is a multiple of sizeof(int).
  const size_t nElem = bytes / sizeof(int);
  int rank = ctx->rank;
  // Empirical block-count tuning: fewer blocks for tiny messages, more for large ones.
  if (bytes <= 32 * (1 << 20)) {
    if (nElem <= 4096) {
      nBlocks = 7;
    } else if (nElem <= 32768) {
      nBlocks = 14;
    } else if (nElem >= 2097152) {
      nBlocks = 35;
    }
  } else {
    nBlocks = 35;
  }
  // Offset of `output` inside its registered base allocation (computed in initAllgatherContext).
  size_t channelOutOffset = *static_cast<size_t*>(ctx->extras["channel_out_offset"].get());
  // In-place when `input` is exactly this rank's chunk within `output`.
  if ((char*)input == (char*)output + rank * bytes) {
    allgather6<false><<<nBlocks, 1024, 0, stream>>>((void*)input, ctx->memoryChannelDeviceHandles.get(),
                                                    channelOutOffset, ctx->rank, ctx->workSize, ctx->nRanksPerNode,
                                                    nElem);
  } else {
    allgather6<true><<<nBlocks, 1024, 0, stream>>>((void*)input, ctx->memoryChannelDeviceHandles.get(),
                                                   channelOutOffset, ctx->rank, ctx->workSize, ctx->nRanksPerNode,
                                                   nElem);
  }
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    WARN("AllgatherAlgo6 failed with error %d", err);
    return ncclInternalError;
  }
  return ncclSuccess;
}
// Build the per-buffer algorithm context: registers the output buffer's base
// allocation with peers and wires memory channels over the pre-created
// semaphores. The context is cached by key unless channel caching is disabled.
std::shared_ptr<mscclpp::AlgorithmCtx> AllgatherAlgo6::initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                                            const void*, void* output, size_t count,
                                                                            mscclpp::DataType dtype) {
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  // setup semaphores (shared across contexts; created once in initialize())
  ctx->memorySemaphores = this->memorySemaphores_;
  size_t bytes = count * getDataTypeSize(dtype);
  size_t recvBytes;
  CUdeviceptr recvBasePtr;
  // Register the whole base allocation containing `output` so the cached
  // channels stay valid for any sub-buffer of the same allocation.
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  size_t channelOutOffset = (char*)output - (char*)recvBasePtr;
  if (disableChannelCache_) {
    // No caching: register exactly this buffer, so no offset is needed.
    channelOutOffset = 0;
    recvBytes = bytes;
    recvBasePtr = (CUdeviceptr)output;
  }
  ctx->extras.insert({"channel_out_offset", std::make_shared<size_t>(channelOutOffset)});
  // register the memory for the broadcast operation
  mscclpp::RegisteredMemory localMemory =
      comm->registerMemory((void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc);
  std::vector<mscclpp::RegisteredMemory> remoteMemories = setupRemoteMemories(comm, ctx->rank, localMemory);
  ctx->memoryChannels = std::move(
      setupMemoryChannels(this->conns_, ctx->memorySemaphores, remoteMemories, localMemory, nChannelsPerConnection_));
  ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
  // keep registered memories reference (channels only hold non-owning views)
  ctx->registeredMemories = std::move(remoteMemories);
  ctx->registeredMemories.push_back(localMemory);
  return ctx;
}
// Cache key for context reuse: keyed on the output buffer's base allocation
// (pointer + size), so any sub-buffer of the same allocation reuses channels.
// With caching disabled, a monotonically increasing tag makes every key unique.
mscclpp::AlgorithmCtxKey AllgatherAlgo6::generateAllgatherContextKey(const void*, void* output, size_t,
                                                                     mscclpp::DataType) {
  static int tag = 0;  // NOTE(review): not thread-safe; assumes single-threaded key generation — confirm
  if (disableChannelCache_) {
    // always return a new key if channel cache is disabled
    return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, tag++};
  }
  size_t recvBytes;
  CUdeviceptr recvBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  return mscclpp::AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, 0};
}
// Package this builder as a mscclpp::Algorithm named "default_allgather6".
// The lambdas share ownership of `self`, so the algorithm object keeps the
// builder (connections, semaphores) alive for its whole lifetime.
mscclpp::Algorithm AllgatherAlgo6::build() {
  auto self = std::make_shared<AllgatherAlgo6>();
  mscclpp::Algorithm allgatherAlgo(
      "default_allgather6", "allgather",
      // init hook
      [self](std::shared_ptr<mscclpp::Communicator> comm,
             std::unordered_map<std::string, std::shared_ptr<void>>& extras) { self->initialize(comm, extras); },
      // kernel-launch hook
      [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t count,
             mscclpp::DataType dtype, cudaStream_t stream,
             std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
        return self->allgatherKernelFunc(ctx, input, output, count, dtype, stream, extras);
      },
      // context-creation hook
      [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
             mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, count, dtype); },
      // context-cache-key hook
      [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
        return self->generateAllgatherContextKey(input, output, count, dtype);
      });
  return allgatherAlgo;
}
// One-time setup: establish peer connections and adopt the shared scratch
// buffer handed in through `extras` ("scratch": shared_ptr<char>,
// "scratch_size": shared_ptr<size_t>). Throws std::out_of_range if either
// entry is missing.
void AllgatherAlgo8::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  conns_ = setupConnections(comm);
  auto scratchEntry = extras.at("scratch");
  auto scratchSizeEntry = extras.at("scratch_size");
  scratchBuffer_ = std::static_pointer_cast<char>(scratchEntry);
  scratchBufferSize_ = *std::static_pointer_cast<size_t>(scratchSizeEntry);
}
// Launch the allgather8 kernel (scratch-buffer pipeline) with a fixed 56x1024
// launch configuration. Picks in-place vs. out-of-place by checking whether
// `input` aliases this rank's slot in `output`. Returns ncclInternalError on
// launch failure.
ncclResult_t AllgatherAlgo8::allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                                 void* output, size_t count, mscclpp::DataType dtype,
                                                 cudaStream_t stream,
                                                 std::unordered_map<std::string, std::shared_ptr<void>>&) {
  int rank = ctx->rank;
  const size_t bytes = count * getDataTypeSize(dtype);
  // Kernel moves data in int-sized units; assumes `bytes` is a multiple of sizeof(int).
  const size_t nElem = bytes / sizeof(int);
  if ((char*)input == (char*)output + rank * bytes) {
    allgather8<false><<<56, 1024, 0, stream>>>((void*)input, this->scratchBuffer_.get(), (void*)output,
                                               ctx->memoryChannelDeviceHandles.get(), rank, ctx->nRanksPerNode,
                                               ctx->workSize, nElem);
  } else {
    allgather8<true><<<56, 1024, 0, stream>>>((void*)input, this->scratchBuffer_.get(), (void*)output,
                                              ctx->memoryChannelDeviceHandles.get(), rank, ctx->nRanksPerNode,
                                              ctx->workSize, nElem);
  }
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    WARN("AllgatherAlgo8 failed with error %d", err);
    return ncclInternalError;
  }
  return ncclSuccess;
}
// Build the algorithm context for the scratch-based allgather: remote peers
// see our scratch buffer (writes land there), while the local input buffer is
// registered for the channels' source side. 56 channels per connection,
// matching the kernel's 56-block launch.
std::shared_ptr<mscclpp::AlgorithmCtx> AllgatherAlgo8::initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                                            const void* input, void*, size_t count,
                                                                            mscclpp::DataType dtype) {
  constexpr int nChannelsPerConnection = 56;
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  // setup semaphores (created per context, unlike AllgatherAlgo6 which shares them)
  ctx->memorySemaphores = std::move(setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection));
  size_t bytes = count * getDataTypeSize(dtype);
  // register the memory for the broadcast operation
  mscclpp::RegisteredMemory localMemory = comm->registerMemory((void*)input, bytes, mscclpp::Transport::CudaIpc);
  mscclpp::RegisteredMemory scratchMemory =
      comm->registerMemory(this->scratchBuffer_.get(), scratchBufferSize_, mscclpp::Transport::CudaIpc);
  std::vector<mscclpp::RegisteredMemory> remoteMemories = setupRemoteMemories(comm, ctx->rank, scratchMemory);
  // setup channels
  ctx->memoryChannels = std::move(
      setupMemoryChannels(this->conns_, ctx->memorySemaphores, remoteMemories, localMemory, nChannelsPerConnection));
  ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
  // keep registered memories reference (channels only hold non-owning views)
  ctx->registeredMemories = std::move(remoteMemories);
  ctx->registeredMemories.push_back(localMemory);
  ctx->registeredMemories.push_back(scratchMemory);
  return ctx;
}
// Context key for the scratch-based (non-zero-copy) allgather: user buffers
// never feed the key, so every invocation maps to one shared cached context.
mscclpp::AlgorithmCtxKey AllgatherAlgo8::generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType) {
  mscclpp::AlgorithmCtxKey sharedKey{nullptr, nullptr, 0, 0, 0};
  return sharedKey;
}
// Package this builder as a mscclpp::Algorithm named "default_allgather8".
// The lambdas share ownership of `self`, so the algorithm object keeps the
// builder (connections, scratch buffer) alive for its whole lifetime.
mscclpp::Algorithm AllgatherAlgo8::build() {
  auto self = std::make_shared<AllgatherAlgo8>();
  mscclpp::Algorithm allgatherAlgo(
      "default_allgather8", "allgather",
      // init hook
      [self](std::shared_ptr<mscclpp::Communicator> comm,
             std::unordered_map<std::string, std::shared_ptr<void>>& extras) { self->initialize(comm, extras); },
      // kernel-launch hook
      [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t count,
             mscclpp::DataType dtype, cudaStream_t stream,
             std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
        return self->allgatherKernelFunc(ctx, input, output, count, dtype, stream, extras);
      },
      // context-creation hook
      [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
             mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, count, dtype); },
      // context-cache-key hook
      [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
        return self->generateAllgatherContextKey(input, output, count, dtype);
      });
  return allgatherAlgo;
}

View File

@@ -1,255 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#ifndef ALLGATHER_HPP_
#define ALLGATHER_HPP_
#include <mscclpp/nccl.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/executor.hpp>
#include <mscclpp/gpu.hpp>
#include <mscclpp/memory_channel.hpp>
#include <mscclpp/memory_channel_device.hpp>
#include "common.hpp"
// Zero-copy allgather kernel: every warp pushes a slice of this rank's chunk
// directly into each peer's output buffer (via memory channels). Layout:
// warps round-robin over peers; each warp moves unitBytesPerWarp per step.
// IsOutOfPlace additionally materializes this rank's own chunk in the local
// output buffer. Entry/exit signal+wait pairs act as barriers with the peers.
template <bool IsOutOfPlace>
__global__ void __launch_bounds__(1024, 1)
    allgather6(void* sendbuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels, size_t channelOutOffset,
               size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) {
  const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  const size_t lid = tid % WARP_SIZE;  // lane within warp
  const size_t wid = tid / WARP_SIZE;  // global warp id
  const size_t nThread = blockDim.x * gridDim.x;
  const size_t nWarp = nThread / WARP_SIZE;
  const size_t nPeer = nRanksPerNode - 1;
  const size_t chanOffset = nPeer * blockIdx.x;  // each block has its own set of nPeer channels
  auto memChans = memoryChannels + chanOffset;
  // Entry barrier: first nPeer threads of each block handshake with the peers.
  if (threadIdx.x < nPeer) {
    memChans[threadIdx.x].relaxedSignal();
    memChans[threadIdx.x].wait();
  }
  __syncthreads();
  const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
  const size_t bytes = bytesPerGPU * nPeer;
  // Larger per-thread unit for big transfers, smaller to keep all threads busy
  // on small ones.
  size_t unitBytesPerThread;
  if (bytes >= nThread * 64) {
    unitBytesPerThread = 64;
  } else {
    unitBytesPerThread = 16;
  }
  const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE;
  const size_t unitBytes = unitBytesPerWarp * nWarp;
  const size_t nLoop = bytes / unitBytes;
  if (nLoop > 0) {
    // First loop unrolling
    const size_t peerIdx = wid % nPeer;
    const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp;
    if constexpr (IsOutOfPlace) {
      // Out-of-place: write the send data into both the local (src_) and the
      // peer's (dst_) output buffer — presumably because the local output does
      // not yet hold this rank's chunk; confirm against channel semantics.
      char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_);
      char* src = reinterpret_cast<char*>(memChans[peerIdx].src_);
      char* buff = reinterpret_cast<char*>(sendbuff);
      const size_t offsetWithinRank = (wid / nPeer) * unitBytesPerWarp;
      mscclpp::copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid,
                               WARP_SIZE);
      mscclpp::copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid,
                               WARP_SIZE);
    } else {
      // In-place: push this slice straight into the peer's buffer.
      memChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE);
    }
  }
  // Remaining full units (loop starts at 1 because iteration 0 was peeled above).
  for (size_t i = 1; i < nLoop; ++i) {
    const size_t gWid = wid + i * nWarp;
    const size_t peerIdx = gWid % nPeer;
    const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp;
    if constexpr (IsOutOfPlace) {
      char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_);
      char* src = reinterpret_cast<char*>(memChans[peerIdx].src_);
      char* buff = reinterpret_cast<char*>(sendbuff);
      const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp;
      mscclpp::copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid,
                               WARP_SIZE);
      mscclpp::copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid,
                               WARP_SIZE);
    } else {
      memChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE);
    }
  }
  // Tail: transfer the final partial unit, clamped to the chunk boundary.
  if (bytes % unitBytes > 0) {
    const size_t gWid = wid + nLoop * nWarp;
    const size_t peerIdx = gWid % nPeer;
    const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp;
    const size_t offset = bytesPerGPU * rank + offsetWithinRank;
    const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU)
                                   ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0)
                                   : unitBytesPerWarp;
    if (remainBytes > 0) {
      if constexpr (IsOutOfPlace) {
        char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_);
        char* src = reinterpret_cast<char*>(memChans[peerIdx].src_);
        char* buff = reinterpret_cast<char*>(sendbuff);
        mscclpp::copy<16, true>(src + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE);
        mscclpp::copy<16, true>(dst + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE);
      } else {
        memChans[peerIdx].put<16, true>(offset + channelOutOffset, remainBytes, lid, WARP_SIZE);
      }
    }
  }
  // Grid-wide barrier so all blocks finish writing before the exit handshake.
  deviceSyncer.sync(gridDim.x);
  // Exit barrier: ensure peers' writes into our buffer have completed too.
  if (threadIdx.x < nPeer) {
    memChans[threadIdx.x].signal();
    memChans[threadIdx.x].wait();
  }
}
// Scratch-buffer allgather kernel: data flows through a shared scratch buffer
// in fixed-size chunks. Per chunk: barrier with peers, broadcast the local
// chunk into every peer's scratch slot (at this rank's offset), barrier again,
// then copy each peer's chunk from the local scratch into the result buffer.
// IsOutOfPlace also copies the local chunk directly into the result.
// Assumes (nelems * sizeof(int)) is divisible by 16 (int4 transfers).
template <bool IsOutOfPlace>
__global__ void __launch_bounds__(1024, 1)
    allgather8(void* buff, void* scratch, void* resultBuff,
               mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels, int rank, int nRanksPerNode,
               [[maybe_unused]] int worldSize, size_t nelems) {
  const int nPeer = nRanksPerNode - 1;
  const size_t chanOffset = nPeer * blockIdx.x;  // each block owns nPeer channels
  // assume (nelems * sizeof(T)) is divisible by 16
  const size_t nInt4 = nelems * sizeof(int) / sizeof(int4);
  auto memoryChans = memoryChannels + chanOffset;
  int4* buff4 = reinterpret_cast<int4*>(buff);
  int4* scratch4 = reinterpret_cast<int4*>(scratch);
  int4* resultBuff4 = reinterpret_cast<int4*>(resultBuff);
  const size_t unitNInt4 = blockDim.x * gridDim.x;  // The number of int4 transfers at once
  const size_t nInt4PerChunk = unitNInt4 * 4;       // 4 instructions per thread to make it more efficient
  const size_t nItrs = nInt4 / nInt4PerChunk;
  const size_t restNInt4 = nInt4 % nInt4PerChunk;
  const size_t scratchChunkRankOffset = nInt4PerChunk * rank;  // this rank's slot in every scratch buffer
  // Cache the channel handles in shared memory for faster repeated access.
  __shared__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> channels[MAX_NRANKS_PER_NODE - 1];
  const int lid = threadIdx.x % WARP_SIZE;
  if (lid < nPeer) {
    channels[lid] = memoryChans[lid];
  }
  __syncwarp();
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  // we can use double buffering to hide synchronization overhead
  for (size_t itr = 0; itr < nItrs; itr++) {
    // Barrier: scratch slot from the previous round may be reused only after
    // all peers have drained it.
    if (threadIdx.x < static_cast<uint32_t>(nPeer)) {
      channels[threadIdx.x].signal();
      channels[threadIdx.x].wait();
    }
    __syncthreads();
    // Starts allgather
    for (size_t idx = tid; idx < nInt4PerChunk; idx += blockDim.x * gridDim.x) {
      int4 val = buff4[itr * nInt4PerChunk + idx];
      for (int i = 0; i < nPeer; i++) {
        // Rotate the starting peer by rank to spread link contention.
        const int peerIdx = (i + rank) % nPeer;
        channels[peerIdx].write(idx + scratchChunkRankOffset, val);
      }
      if constexpr (IsOutOfPlace) {
        resultBuff4[nInt4 * rank + idx + itr * nInt4PerChunk] = val;
      }
    }
    // Ensure that all writes of this block have been issued before issuing the signal
    __syncthreads();
    if (threadIdx.x < static_cast<uint32_t>(nPeer)) {
      channels[threadIdx.x].signal();
      channels[threadIdx.x].wait();
    }
    __syncthreads();
    // Gather: copy each peer's chunk from our scratch into the result buffer.
    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
      const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;  // channel index -> rank (skip self)
      const int resultOffset = nInt4 * remoteRank + itr * nInt4PerChunk;
      for (size_t idx = tid; idx < nInt4PerChunk; idx += blockDim.x * gridDim.x) {
        int4 val = scratch4[nInt4PerChunk * remoteRank + idx];
        resultBuff4[resultOffset + idx] = val;
      }
    }
  }
  // Final partial chunk, same broadcast/gather sequence over restNInt4 elements.
  if (restNInt4 > 0) {
    if (threadIdx.x < static_cast<uint32_t>(nPeer)) {
      channels[threadIdx.x].signal();
      channels[threadIdx.x].wait();
    }
    __syncthreads();
    for (size_t idx = tid; idx < restNInt4; idx += blockDim.x * gridDim.x) {
      int4 val = buff4[nItrs * nInt4PerChunk + idx];
      for (int i = 0; i < nPeer; i++) {
        const int peerIdx = (i + rank) % nPeer;
        channels[peerIdx].write(idx + scratchChunkRankOffset, val);
      }
      if constexpr (IsOutOfPlace) {
        resultBuff4[nInt4 * rank + idx + nItrs * nInt4PerChunk] = val;
      }
    }
    // Ensure that all writes of this block have been issued before issuing the signal
    __syncthreads();
    if (threadIdx.x < static_cast<uint32_t>(nPeer)) {
      channels[threadIdx.x].signal();
      channels[threadIdx.x].wait();
    }
    __syncthreads();
    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
      const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
      const int resultOffset = nInt4 * remoteRank + nItrs * nInt4PerChunk;
      for (size_t idx = tid; idx < restNInt4; idx += blockDim.x * gridDim.x) {
        int4 val = scratch4[nInt4PerChunk * remoteRank + idx];
        resultBuff4[resultOffset + idx] = val;
      }
    }
  }
}
// Zero-copy allgather algorithm builder (kernel: allgather6). Peers write
// directly into each other's output buffers; per-buffer contexts are cached
// unless the channel cache is disabled via the environment.
class AllgatherAlgo6 : public mscclpp::AlgorithmBuilder {
 public:
  AllgatherAlgo6();
  mscclpp::Algorithm build() override;

 private:
  bool disableChannelCache_;  // when true, a fresh context is built per call
  std::vector<mscclpp::Connection> conns_;
  // Pre-created semaphores shared by all cached contexts.
  std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores_;
  const int nChannelsPerConnection_ = 35;  // matches the kernel's max grid size
  void initialize(std::shared_ptr<mscclpp::Communicator> comm, std::unordered_map<std::string, std::shared_ptr<void>>&);
  ncclResult_t allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                                   size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                                   std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  std::shared_ptr<mscclpp::AlgorithmCtx> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm, const void*,
                                                              void* output, size_t, mscclpp::DataType);
  mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType);
};
// Scratch-buffer allgather algorithm builder (kernel: allgather8). Data is
// staged through a shared scratch buffer supplied at initialize() time, so a
// single cached context serves all user buffers.
class AllgatherAlgo8 : public mscclpp::AlgorithmBuilder {
 public:
  mscclpp::Algorithm build() override;

 private:
  std::vector<mscclpp::Connection> conns_;
  void initialize(std::shared_ptr<mscclpp::Communicator> comm,
                  std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  ncclResult_t allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                                   size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                                   std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  std::shared_ptr<mscclpp::AlgorithmCtx> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm, const void*,
                                                              void* output, size_t, mscclpp::DataType);
  mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType);
  size_t scratchBufferSize_;          // capacity of the shared scratch buffer
  std::shared_ptr<char> scratchBuffer_;  // provided via extras["scratch"] in initialize()
};
#endif // ALLGATHER_HPP_

View File

@@ -1,708 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mscclpp/nccl.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/env.hpp>
#include <mscclpp/gpu.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "allreduce.hpp"
#include "datatype_conversion.hpp"
#include "debug.h"
using AllreduceFunc =
std::function<cudaError_t(const void*, void*, void*, void*, void*, mscclpp::DeviceHandle<mscclpp::SwitchChannel>*,
mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t, int, int, int,
size_t, cudaStream_t, uint32_t*, uint32_t*, uint32_t*, uint32_t)>;
namespace {
// Adapter for the all-pairs allreduce kernels. Dispatches by message size:
// tiny -> allreduceAllPairs with a minimal launch, small -> allreduceAllPairs
// with a bigger grid, up to 1 MiB -> allreduce7 with per-arch/type tuned
// launch parameters. NOTE(review): messages larger than 1 MiB launch nothing
// here and just return the last CUDA error — presumably the caller routes
// those to another adapter; confirm.
template <Op OpType, typename T>
struct AllpairAdapter {
  static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t channelInOffset, size_t,
                          size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t nelems,
                          cudaStream_t stream, uint32_t* deviceFlag7, uint32_t* deviceFlag28, uint32_t* deviceFlag56,
                          uint32_t numScratchBuff) {
    using ChannelType = mscclpp::DeviceHandle<mscclpp::MemoryChannel>;
    if (sizeof(T) * nelems < worldSize * sizeof(int)) {
      // Tiny message: one block per peer, a single warp each.
      int nBlocks = worldSize - 1;
      int nThreadsPerBlock = 32;
      allreduceAllPairs<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
          (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
          nRanksPerNode, worldSize, nelems, deviceFlag7, numScratchBuff);
    } else if (sizeof(T) * nelems <= (1 << 14)) {
      // Up to 16 KiB: wider grid, same all-pairs kernel.
      int nBlocks = (worldSize - 1) * 4;
      int nThreadsPerBlock = 512;
      allreduceAllPairs<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
          (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
          nRanksPerNode, worldSize, nelems, deviceFlag28, numScratchBuff);
    } else if (sizeof(T) * nelems <= (1 << 20)) {
      // 16 KiB - 1 MiB: allreduce7 with empirically tuned launch parameters.
      int nBlocks = (nRanksPerNode - 1) * 4;
      int nThreadsPerBlock = 1024;
      uint32_t* deviceFlag = deviceFlag28;  // flag array must match the grid size in use
      if (nelems >= 8192) {
        nBlocks = (worldSize - 1) * 8;
        nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
        deviceFlag = deviceFlag56;
#if defined(__HIP_PLATFORM_AMD__)
        // AMD-specific thread-count tuning for mid-size messages.
        size_t sizeBytes = sizeof(T) * nelems;
        if constexpr (std::is_same_v<T, __half>) {
          // Half-specific tuning for 32KB-256KB range
          if (sizeBytes == (32 << 10)) {
            nThreadsPerBlock = 64;
          } else if (sizeBytes >= (64 << 10) && sizeBytes <= (256 << 10)) {
            nThreadsPerBlock = 128;
          }
        }
#if defined(__FP8_TYPES_EXIST__)
        // FP8-specific tuning for 32KB-256KB range
        if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
          if (sizeBytes == (32 << 10)) {
            nThreadsPerBlock = 64;
          } else if (sizeBytes == (64 << 10)) {
            nThreadsPerBlock = 128;
          } else if (sizeBytes >= (128 << 10) && sizeBytes <= (256 << 10)) {
            nThreadsPerBlock = 256;
          }
        }
#endif
#endif
      }
#if defined(ENABLE_NPKIT)
      // NPKit build: reserve shared memory for event collection and pass the
      // profiling contexts into the kernel.
      size_t NpkitSharedMemSize = NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent);
      allreduce7<OpType><<<nBlocks, nThreadsPerBlock, NpkitSharedMemSize, stream>>>(
          (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
          nRanksPerNode, worldSize, nelems, deviceFlag, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
          NpKit::GetCpuTimestamp());
#else
      allreduce7<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
          (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
          nRanksPerNode, worldSize, nelems, deviceFlag, numScratchBuff);
#endif
    }
    return cudaGetLastError();
  }
};
// Adapter for the zero-copy NVLS (switch-channel) allreduce kernel allreduce9.
// FP8 is rejected as unsupported for NVLS. NOTE(review): the guard below tests
// __CUDA_ARCH__ in what looks like host-side launch code, where that macro is
// normally undefined — verify the FP8 rejection actually takes effect.
template <Op OpType, typename T>
struct NvlsAdapter {
  static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsOutChannels, size_t channelInOffset,
                          size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t nelems,
                          cudaStream_t stream, uint32_t*, uint32_t*, uint32_t*, uint32_t) {
#if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
    if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
      return cudaErrorNotSupported;
    } else
#endif
    {
      using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
      // One block per local rank; buffers are accessed in place via switch channels.
      int nBlocks = nRanksPerNode;
      int nThreadsPerBlock = 1024;
      allreduce9<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
                                                              nvlsOutChannels, channelInOffset, channelOutOffset,
                                                              nelems * sizeof(T), rank, nRanksPerNode);
      return cudaGetLastError();
    }
  }
};
// Adapter for NVLS allreduce variants that stage data through a scratch buffer
// (allreduce10 for < 16 MiB, allreduce11 otherwise). FP8 is rejected as
// unsupported for NVLS. NOTE(review): same host-side __CUDA_ARCH__ guard
// concern as NvlsAdapter — verify the FP8 rejection takes effect.
template <Op OpType, typename T>
struct NvlsWithCopyAdapter {
  static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t scratchBufferSize,
                          int rank, int nRanksPerNode, int, size_t nelems, cudaStream_t stream, uint32_t*, uint32_t*,
                          uint32_t*, uint32_t) {
#if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
    if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
      return cudaErrorNotSupported;
    } else
#endif
    {
      using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
      if (sizeof(T) * nelems < (1 << 24)) {
        // Under 16 MiB: allreduce10 with 4 blocks per local rank.
        int nBlocks = nRanksPerNode * 4;
        int nThreadsPerBlock = 1024;
        allreduce10<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
                                                                 nvlsChannels, nelems * sizeof(T), scratchBufferSize,
                                                                 rank, nRanksPerNode);
      } else {
        // 16 MiB and above: allreduce11 with 5 blocks per local rank.
        int nBlocks = nRanksPerNode * 5;
        int nThreadsPerBlock = 1024;
        allreduce11<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
                                                                 nvlsChannels, nelems * sizeof(T), scratchBufferSize,
                                                                 rank, nRanksPerNode);
      }
      return cudaGetLastError();
    }
  }
};
// Adapter for the allreduce8 kernel (memory-channel in/out pair) with a fixed
// launch configuration of (nRanksPerNode - 1) * 5 blocks x 512 threads.
template <Op OpType, typename T>
struct Allreduce8Adapter {
  static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels,
                          void* memoryOutChannels, mscclpp::DeviceHandle<mscclpp::SwitchChannel>*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t channelOutOffset, size_t,
                          int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream, uint32_t*,
                          uint32_t*, uint32_t*, uint32_t) {
    using ChannelType = mscclpp::DeviceHandle<mscclpp::MemoryChannel>;
    int nBlocks = (nRanksPerNode - 1) * 5;
    int nThreadsPerBlock = 512;
    // Input channel offset is fixed at 0; only the output offset is honored.
    allreduce8<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
        (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels,
        channelOutOffset, 0, rank, nRanksPerNode, worldSize, nelems);
    return cudaGetLastError();
  }
};
// Adapter for the NVLS packet-based allreduce kernel. Uses a smaller launch
// (4 blocks x 512 threads) for messages up to 8 KiB, otherwise 8 x 1024.
template <Op OpType, typename T>
struct AllreduceNvlsPacketAdapter {
  static cudaError_t call(const void* input, void* scratch, void* output, void*, void*,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t scratchBufferSize,
                          int rank, int, int worldSize, size_t nelems, cudaStream_t stream, uint32_t* deviceFlag,
                          uint32_t*, uint32_t*, uint32_t) {
    size_t size = nelems * sizeof(T);
    int nBlocks = 8;
    int nThreadsPerBlock = 1024;
    if (size <= (1 << 13)) {
      // Small message: shrink the launch to cut latency.
      nBlocks = 4;
      nThreadsPerBlock = 512;
    }
    allreduceNvlsPacket<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
        (const T*)input, (T*)scratch, (T*)output, nvlsChannels, nelems, scratchBufferSize, rank, worldSize, deviceFlag);
    return cudaGetLastError();
  }
};
// Maps a runtime data type onto the matching compile-time Adapter instantiation
// for a fixed reduction operator. Returns nullptr for unsupported dtypes.
// Factoring this out removes the previously duplicated dtype table that had to
// be kept in sync between the SUM and MIN branches of dispatch().
template <template <Op, typename> class Adapter, Op OpType>
AllreduceFunc dispatchForOp(mscclpp::DataType dtype) {
  if (dtype == mscclpp::DataType::FLOAT16) {
    return Adapter<OpType, half>::call;
  } else if (dtype == mscclpp::DataType::FLOAT32) {
    return Adapter<OpType, float>::call;
#if defined(__CUDA_BF16_TYPES_EXIST__)
  } else if (dtype == mscclpp::DataType::BFLOAT16) {
    return Adapter<OpType, __bfloat16>::call;
#endif
#if defined(__FP8_TYPES_EXIST__)
  } else if (dtype == mscclpp::DataType::FP8_E4M3) {
    return Adapter<OpType, __fp8_e4m3>::call;
  } else if (dtype == mscclpp::DataType::FP8_E5M2) {
    return Adapter<OpType, __fp8_e5m2>::call;
#endif
  } else if (dtype == mscclpp::DataType::INT32 || dtype == mscclpp::DataType::UINT32) {
    return Adapter<OpType, int>::call;
  }
  return nullptr;
}

// Selects the kernel-launch function for the given (op, dtype) pair.
// Only SUM and MIN are supported; anything else yields nullptr.
template <template <Op, typename> class Adapter>
AllreduceFunc dispatch(ncclRedOp_t op, mscclpp::DataType dtype) {
  Op reduceOp = getReduceOp(op);
  if (reduceOp == SUM) {
    return dispatchForOp<Adapter, SUM>(dtype);
  }
  if (reduceOp == MIN) {
    return dispatchForOp<Adapter, MIN>(dtype);
  }
  return nullptr;
}
} // namespace
// Translates an NCCL reduction operator into the internal Op enum.
// Only ncclSum and ncclMin are supported; anything else throws.
enum Op getReduceOp(ncclRedOp_t op) {
  if (op == ncclSum) {
    return SUM;
  }
  if (op == ncclMin) {
    return MIN;
  }
  WARN("op is invalid, op: %d", op);
  throw mscclpp::Error("Invalid operation", mscclpp::ErrorCode::InternalError);
}
// One-time setup: adopts the caller-provided scratch buffer and allocates the
// device-side flag arrays used by kernels launched with 7/28/56 blocks.
// `extras` must contain "scratch" (shared_ptr<char>) and "scratch_size" (size_t).
void AllreducePacket::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                 std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  this->scratchBufferSize_ = *(size_t*)(extras.at("scratch_size").get());
  scratchBuffer_ = std::static_pointer_cast<char>(extras.at("scratch"));
  // Largest flag array; the smaller ones are prefixes of the same init pattern.
  constexpr int kMaxFlags = 56;
  deviceFlag7_ = mscclpp::detail::gpuCallocShared<uint32_t>(7);
  deviceFlag28_ = mscclpp::detail::gpuCallocShared<uint32_t>(28);
  deviceFlag56_ = mscclpp::detail::gpuCallocShared<uint32_t>(kMaxFlags);
  // Fill-construct (all flags start at 1) instead of a manual loop.
  std::vector<uint32_t> initFlag(kMaxFlags, 1);
  mscclpp::gpuMemcpy<uint32_t>(deviceFlag7_.get(), initFlag.data(), 7, cudaMemcpyHostToDevice);
  mscclpp::gpuMemcpy<uint32_t>(deviceFlag28_.get(), initFlag.data(), 28, cudaMemcpyHostToDevice);
  mscclpp::gpuMemcpy<uint32_t>(deviceFlag56_.get(), initFlag.data(), kMaxFlags, cudaMemcpyHostToDevice);
  this->conns_ = setupConnections(comm);
}
// Runs the packet-based allreduce: resolves the op from `extras`, computes the
// input offset within its base allocation, and launches via the AllpairAdapter.
ncclResult_t AllreducePacket::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                                  void* output, size_t count, mscclpp::DataType dtype,
                                                  cudaStream_t stream,
                                                  std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  ncclRedOp_t op = *static_cast<ncclRedOp_t*>(extras.at("op").get());
  // Channels are registered on the base pointer; pass the offset of `input`.
  CUdeviceptr inputBase;
  size_t inputSpan;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&inputBase, &inputSpan, (CUdeviceptr)input));
  const size_t inOffset = (char*)input - (char*)inputBase;
  AllreduceFunc launcher = dispatch<AllpairAdapter>(op, dtype);
  if (launcher == nullptr) {
    WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
    return ncclInvalidArgument;
  }
  cudaError_t launchStatus = launcher(input, this->scratchBuffer_.get(), output, ctx->memoryChannelDeviceHandles.get(),
                                      nullptr, nullptr, nullptr, inOffset, 0, this->scratchBufferSize_, ctx->rank,
                                      ctx->nRanksPerNode, ctx->workSize, count, stream, deviceFlag7_.get(),
                                      deviceFlag28_.get(), deviceFlag56_.get(), this->nSegmentsForScratchBuffer_);
  if (launchStatus != cudaSuccess) {
    WARN("AllreducePacket failed with error: %s", cudaGetErrorString(launchStatus));
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Builds the per-allocation context (semaphores, registered memories, channels)
// for the packet allreduce. Semaphores and the peers' scratch registrations are
// created once and reused from the cached ctx_ on subsequent calls; only the
// channels bound to the current input's base allocation are rebuilt.
// NOTE: registeredMemories keeps the rank-local memory as its LAST element —
// the pop_back/push_back pairing below depends on that invariant.
std::shared_ptr<mscclpp::AlgorithmCtx> AllreducePacket::initAllreduceContext(
    std::shared_ptr<mscclpp::Communicator> comm, const void* input, void*, size_t, mscclpp::DataType) {
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  const int nChannelsPerConnection = 56;
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  if (this->ctx_ == nullptr) {
    // First call: create everything and exchange scratch registrations with peers.
    // setup semaphores
    ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection);
    // setup registered memories
    mscclpp::RegisteredMemory scratchMemory =
        comm->registerMemory(this->scratchBuffer_.get(), this->scratchBufferSize_, mscclpp::Transport::CudaIpc);
    std::vector<mscclpp::RegisteredMemory> remoteMemories = setupRemoteMemories(comm, ctx->rank, scratchMemory);
    ctx->registeredMemories = std::move(remoteMemories);
    ctx->registeredMemories.push_back(scratchMemory);
  } else {
    // Subsequent calls: reuse semaphores and remote registrations from the cached context.
    ctx->memorySemaphores = ctx_->memorySemaphores;
    ctx->registeredMemories = ctx_->registeredMemories;
    ctx->registeredMemories.pop_back();  // remove the local memory from previous context
  }
  // Register the full base allocation that contains `input`.
  size_t sendBytes;
  CUdeviceptr sendBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
  mscclpp::RegisteredMemory localMemory =
      comm->registerMemory((void*)sendBasePtr, sendBytes, mscclpp::Transport::CudaIpc);
  // setup channels
  ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory,
                                            nChannelsPerConnection);
  ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
  // Re-append the (new) local memory so the invariant holds for the next reuse.
  ctx->registeredMemories.emplace_back(localMemory);
  this->ctx_ = ctx;
  return ctx;
}
// Keys the context cache on the input's base allocation so that channels are
// reused for every buffer carved out of the same allocation.
mscclpp::AlgorithmCtxKey AllreducePacket::generateAllreduceContextKey(const void* input, void*, size_t,
                                                                      mscclpp::DataType) {
  CUdeviceptr baseAddr;
  size_t baseSize;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&baseAddr, &baseSize, (CUdeviceptr)input));
  return mscclpp::AlgorithmCtxKey{(void*)baseAddr, nullptr, baseSize, 0, 0};
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm AllreducePacket::build() {
  auto self = std::make_shared<AllreducePacket>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->allreduceKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initAllreduceContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateAllreduceContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_allreduce_packet", "allreduce", initFn, kernelFn, ctxFn, keyFn);
}
// One-time setup for the NVLS allreduce: establishes peer connections and the
// semaphore-backed base channels used for inter-rank synchronization.
void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                               std::unordered_map<std::string, std::shared_ptr<void>>&) {
  nSwitchChannels_ = 8;
  this->conns_ = setupConnections(comm);
  auto semaphores = setupMemorySemaphores(comm, this->conns_, nSwitchChannels_);
  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, semaphores, nSwitchChannels_);
  this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_);
}
// Runs the NVLS allreduce. Only SUM is dispatched here; the switch-channel set
// for the output follows the input's set when the operation is out-of-place.
ncclResult_t AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                                void* output, size_t count, mscclpp::DataType dtype,
                                                cudaStream_t stream,
                                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
  AllreduceFunc launcher = dispatch<NvlsAdapter>(ncclSum, dtype);
  if (launcher == nullptr) {
    WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
    return ncclInvalidArgument;
  }
  // Offsets of the user buffers within their base allocations.
  CUdeviceptr inBase, outBase;
  size_t inSpan, outSpan;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&inBase, &inSpan, (CUdeviceptr)input));
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&outBase, &outSpan, (CUdeviceptr)output));
  const size_t inOffset = (char*)input - (char*)inBase;
  const size_t outOffset = (char*)output - (char*)outBase;
  // In-place reuses the input channels; out-of-place channels are appended after them.
  mscclpp::DeviceHandle<mscclpp::SwitchChannel>* inChannels = ctx->switchChannelDeviceHandles.get();
  mscclpp::DeviceHandle<mscclpp::SwitchChannel>* outChannels =
      (input != output) ? inChannels + nSwitchChannels_ : inChannels;
  cudaError_t launchStatus = launcher(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr,
                                      inChannels, outChannels, inOffset, outOffset, 0, ctx->rank, ctx->nRanksPerNode,
                                      ctx->workSize, count, stream, nullptr, nullptr, nullptr, 0);
  if (launchStatus != cudaSuccess) {
    WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(launchStatus));
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Keys the context cache on the base allocations of both user buffers, since the
// NVLS channels are bound directly to those allocations.
mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t,
                                                                    mscclpp::DataType) {
  CUdeviceptr inBase, outBase;
  size_t inSpan, outSpan;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&inBase, &inSpan, (CUdeviceptr)input));
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&outBase, &outSpan, (CUdeviceptr)output));
  return mscclpp::AlgorithmCtxKey{(void*)inBase, (void*)outBase, inSpan, outSpan, 0};
}
// Builds a context whose NVLS switch channels are bound to the base allocations
// of the user's input (and, when out-of-place, output) buffers.
// NOTE(review): setupNvlsConnections appears to be a collective call — every
// rank must take the same branch of the input != output check; verify callers.
std::shared_ptr<mscclpp::AlgorithmCtx> AllreduceNvls::initAllreduceContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                                           const void* input, void* output, size_t,
                                                                           mscclpp::DataType) {
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  // Resolve the full base allocations containing the user pointers.
  size_t sendBytes, recvBytes;
  CUdeviceptr sendBasePtr, recvBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  // setup channels
  ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
  ctx->switchChannels = setupNvlsChannels(ctx->nvlsConnections, (void*)sendBasePtr, sendBytes, nSwitchChannels_);
  if (input != output) {
    // Out-of-place: a second set of connections/channels for the output buffer,
    // appended after the input set (the kernel indexes them by nSwitchChannels_).
    auto nvlsOutConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
    std::vector<mscclpp::SwitchChannel> outChannels =
        setupNvlsChannels(nvlsOutConnections, (void*)recvBasePtr, recvBytes, nSwitchChannels_);
    ctx->nvlsConnections.insert(ctx->nvlsConnections.end(), nvlsOutConnections.begin(), nvlsOutConnections.end());
    ctx->switchChannels.insert(ctx->switchChannels.end(), outChannels.begin(), outChannels.end());
  }
  ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
  return ctx;
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm AllreduceNvls::build() {
  auto self = std::make_shared<AllreduceNvls>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->allreduceKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initAllreduceContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateAllreduceContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_allreduce_nvls", "allreduce", initFn, kernelFn, ctxFn, keyFn);
}
// One-time setup: adopts the caller-provided scratch buffer (the NVLS staging
// area) and builds the semaphore-backed base channels for synchronization.
void AllreduceNvlsWithCopy::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  nSwitchChannels_ = 8;
  constexpr int kBaseChannels = 64;
  scratchBuffer_ = std::static_pointer_cast<char>(extras.at("scratch"));
  scratchBufferSize_ = *(size_t*)(extras.at("scratch_size").get());
  this->conns_ = setupConnections(comm);
  auto semaphores = setupMemorySemaphores(comm, this->conns_, kBaseChannels);
  this->baseChannels_ = setupBaseMemoryChannels(this->conns_, semaphores, kBaseChannels);
  this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_);
}
// Runs the copy-in/copy-out NVLS allreduce: data is staged through the internal
// scratch buffer, so no offsets into the user buffers are needed.
ncclResult_t AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx,
                                                        const void* input, void* output, size_t count,
                                                        mscclpp::DataType dtype, cudaStream_t stream,
                                                        std::unordered_map<std::string, std::shared_ptr<void>>&) {
  AllreduceFunc launcher = dispatch<NvlsWithCopyAdapter>(ncclSum, dtype);
  if (launcher == nullptr) {
    WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
    return ncclInvalidArgument;
  }
  cudaError_t launchStatus =
      launcher(input, this->scratchBuffer_.get(), output, this->memoryChannelsDeviceHandle_.get(), nullptr,
               ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, ctx->rank,
               ctx->nRanksPerNode, ctx->workSize, count, stream, nullptr, nullptr, nullptr, 0);
  if (launchStatus != cudaSuccess) {
    WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(launchStatus));
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Always returns the zero key: channels are bound to the internal scratch
// buffer, so one shared context serves every (input, output, count) combination.
mscclpp::AlgorithmCtxKey AllreduceNvlsWithCopy::generateAllreduceContextKey(const void*, void*, size_t,
                                                                            mscclpp::DataType) {
  return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
// Builds the single shared context: NVLS connections and switch channels bound
// to the internal scratch buffer (the user buffers are never registered).
std::shared_ptr<mscclpp::AlgorithmCtx> AllreduceNvlsWithCopy::initAllreduceContext(
    std::shared_ptr<mscclpp::Communicator> comm, const void*, void*, size_t, mscclpp::DataType) {
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  auto bootstrap = comm->bootstrap();
  ctx->rank = bootstrap->getRank();
  ctx->workSize = bootstrap->getNranks();
  ctx->nRanksPerNode = bootstrap->getNranksPerNode();
  ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
  ctx->switchChannels =
      setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_.get(), scratchBufferSize_, nSwitchChannels_);
  ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
  return ctx;
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm AllreduceNvlsWithCopy::build() {
  auto self = std::make_shared<AllreduceNvlsWithCopy>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->allreduceKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initAllreduceContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateAllreduceContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_allreduce_nvls_with_copy", "allreduce", initFn, kernelFn, ctxFn, keyFn);
}
// One-time setup: adopts the scratch buffer, creates separate semaphore sets for
// the output path and the input->scratch path, and exchanges the scratch
// registration with all peers.
void Allreduce8::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                            std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  this->scratchBuffer_ = std::static_pointer_cast<char>(extras.at("scratch"));
  this->scratchBufferSize_ = *(size_t*)(extras.at("scratch_size").get());
  this->conns_ = setupConnections(comm);
  nChannelsPerConnection_ = 64;
  comm_ = comm;
  this->outputSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_);
  this->inputScratchSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_);
  mscclpp::RegisteredMemory scratchMem =
      comm->registerMemory(scratchBuffer_.get(), scratchBufferSize_, mscclpp::Transport::CudaIpc);
  this->remoteScratchMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMem);
  localScratchMemory_ = std::move(scratchMem);
}
// Runs the allreduce8 algorithm. Channels from `input` to the peers' scratch
// buffers are built lazily per input pointer and cached in memoryChannelsMap_.
// Fix: the original assigned inputChannelHandles inside the cache-hit branch and
// then unconditionally reassigned it afterwards (dead store), and performed up
// to three map lookups; this version does a single find + emplace.
ncclResult_t Allreduce8::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                             void* output, size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                                             std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  const size_t bytes = count * getDataTypeSize(dtype);
  ncclRedOp_t op = *static_cast<ncclRedOp_t*>(extras.at("op").get());
  // Offset of `output` within its base allocation; output channels are
  // registered on the base pointer.
  size_t recvBytes;
  CUdeviceptr recvBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  size_t channelOutOffset = (char*)output - (char*)recvBasePtr;
  // Lazily build and cache the input->scratch channels for this input pointer.
  auto it = this->memoryChannelsMap_.find(input);
  if (it == this->memoryChannelsMap_.end()) {
    mscclpp::RegisteredMemory localMemory =
        comm_->registerMemory(const_cast<void*>(input), bytes, mscclpp::Transport::CudaIpc);
    std::vector<mscclpp::MemoryChannel> channels =
        setupMemoryChannels(this->conns_, this->inputScratchSemaphores_, this->remoteScratchMemories_, localMemory,
                            nChannelsPerConnection_);
    auto handles = setupMemoryChannelDeviceHandles(channels);
    it = this->memoryChannelsMap_.emplace(input, std::make_pair(std::move(channels), std::move(handles))).first;
  }
  std::shared_ptr<mscclpp::MemoryChannel::DeviceHandle> inputChannelHandles = it->second.second;
  AllreduceFunc allreduce = dispatch<Allreduce8Adapter>(op, dtype);
  if (!allreduce) {
    WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
    return ncclInvalidArgument;
  }
  cudaError_t error =
      allreduce(input, this->scratchBuffer_.get(), output, inputChannelHandles.get(),
                ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank,
                ctx->nRanksPerNode, ctx->workSize, count, stream, nullptr, nullptr, nullptr, 0);
  if (error != cudaSuccess) {
    WARN("Allreduce8 failed with error: %s", cudaGetErrorString(error));
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Keys the context cache on the output's base allocation. When channel caching
// is disabled via the environment, a monotonically increasing tag forces a fresh
// context on every call.
mscclpp::AlgorithmCtxKey Allreduce8::generateAllreduceContextKey(const void*, void* output, size_t, mscclpp::DataType) {
  static int tag = 0;
  size_t outSpan;
  CUdeviceptr outBase;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&outBase, &outSpan, (CUdeviceptr)output));
  const int keyTag = mscclpp::env()->disableChannelCache ? tag++ : 0;
  return mscclpp::AlgorithmCtxKey{nullptr, (void*)outBase, 0, outSpan, keyTag};
}
// Builds the per-output-allocation context: registers the output's base
// allocation, exchanges it with peers, and creates the output-side memory
// channels. Semaphores are shared across contexts via outputSemaphores_
// (created once in initialize()).
std::shared_ptr<mscclpp::AlgorithmCtx> Allreduce8::initAllreduceContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                                        const void*, void* output, size_t,
                                                                        mscclpp::DataType) {
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  // setup semaphores
  ctx->memorySemaphores = this->outputSemaphores_;
  // setup memories and channels
  size_t recvBytes;
  CUdeviceptr recvBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  mscclpp::RegisteredMemory localMemory =
      comm->registerMemory((void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc);
  ctx->registeredMemories = setupRemoteMemories(comm, ctx->rank, localMemory);
  ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory,
                                            nChannelsPerConnection_);
  ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
  return ctx;
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm Allreduce8::build() {
  auto self = std::make_shared<Allreduce8>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->allreduceKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initAllreduceContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateAllreduceContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_allreduce_allreduce8", "allreduce", initFn, kernelFn, ctxFn, keyFn);
}
// One-time setup: adopts the caller-provided scratch buffer and allocates the
// device-side flag array (all flags start at 1).
// `extras` must contain "scratch" (shared_ptr<char>) and "scratch_size" (size_t).
void AllreduceNvlsPacket::initialize(std::shared_ptr<mscclpp::Communicator>,
                                     std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  this->scratchBuffer_ = std::static_pointer_cast<char>(extras.at("scratch"));
  this->scratchBufferSize_ = *(size_t*)(extras.at("scratch_size").get());
  constexpr int kNumFlags = 16;
  deviceFlag_ = mscclpp::detail::gpuCallocShared<uint32_t>(kNumFlags);
  // Fill-construct (all flags start at 1) instead of a manual loop.
  std::vector<uint32_t> initFlag(kNumFlags, 1);
  mscclpp::gpuMemcpy<uint32_t>(deviceFlag_.get(), initFlag.data(), kNumFlags, cudaMemcpyHostToDevice);
}
// Always returns the zero key: channels are bound to the internal scratch
// buffer, so one shared context serves every (input, output, count) combination.
mscclpp::AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t,
                                                                          mscclpp::DataType) {
  return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
// Builds the single shared context: one NVLS switch channel over the internal
// scratch buffer is sufficient for the packet-based algorithm.
std::shared_ptr<mscclpp::AlgorithmCtx> AllreduceNvlsPacket::initAllreduceContext(
    std::shared_ptr<mscclpp::Communicator> comm, const void*, void*, size_t, mscclpp::DataType) {
  constexpr int kSwitchChannels = 1;
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  auto bootstrap = comm->bootstrap();
  ctx->rank = bootstrap->getRank();
  ctx->workSize = bootstrap->getNranks();
  ctx->nRanksPerNode = bootstrap->getNranksPerNode();
  ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, kSwitchChannels);
  ctx->switchChannels =
      setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_.get(), this->scratchBufferSize_, kSwitchChannels);
  ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
  return ctx;
}
// Runs the NVLS packet allreduce. The reduction op arrives as a raw int in
// `extra` and is converted back to ncclRedOp_t for dispatch.
ncclResult_t AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx,
                                                      const void* input, void* output, size_t count,
                                                      mscclpp::DataType dtype, cudaStream_t stream,
                                                      std::unordered_map<std::string, std::shared_ptr<void>>& extra) {
  int op = *static_cast<int*>(extra.at("op").get());
  AllreduceFunc launcher = dispatch<AllreduceNvlsPacketAdapter>(static_cast<ncclRedOp_t>(op), dtype);
  if (launcher == nullptr) {
    WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
    return ncclInvalidArgument;
  }
  cudaError_t launchStatus =
      launcher(input, this->scratchBuffer_.get(), output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(),
               nullptr, 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, count, stream,
               this->deviceFlag_.get(), nullptr, nullptr, 0);
  if (launchStatus != cudaSuccess) {
    WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(launchStatus));
    return ncclUnhandledCudaError;
  }
  return ncclSuccess;
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm AllreduceNvlsPacket::build() {
  auto self = std::make_shared<AllreduceNvlsPacket>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->allreduceKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initAllreduceContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateAllreduceContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_allreduce_nvls_packet", "allreduce", initFn, kernelFn, ctxFn, keyFn);
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,97 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mscclpp/nccl.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "broadcast.hpp"
#include "datatype_conversion.hpp"
// One-time setup: establishes peer connections and adopts the caller-provided
// scratch buffer. `extras` must contain "scratch" and "scratch_size".
void BroadcastAlgo6::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  this->conns_ = setupConnections(comm);
  this->scratchMemSize_ = *(size_t*)(extras.at("scratch_size").get());
  this->scratchBuffer_ = std::static_pointer_cast<char>(extras.at("scratch"));
}
// Launches the broadcast kernel, selecting the out-of-place template variant
// when input and output differ. The kernel works on int-sized words, so the
// element count is converted to a word count.
ncclResult_t BroadcastAlgo6::broadcastKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
                                                 void* output, size_t count, mscclpp::DataType dtype,
                                                 cudaStream_t stream,
                                                 std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  const int root = *(int*)extras.at("root").get();
  const size_t nWords = count * getDataTypeSize(dtype) / sizeof(int);
  const bool outOfPlace = (input != output);
  cudaError_t launchStatus =
      outOfPlace ? broadcast<true>((int*)input, (int*)this->scratchBuffer_.get(), (int*)output,
                                   ctx->memoryChannelDeviceHandles.get(), 0, ctx->rank, ctx->nRanksPerNode, root,
                                   ctx->workSize, nWords, stream)
                 : broadcast<false>((int*)input, (int*)this->scratchBuffer_.get(), (int*)output,
                                    ctx->memoryChannelDeviceHandles.get(), 0, ctx->rank, ctx->nRanksPerNode, root,
                                    ctx->workSize, nWords, stream);
  return (launchStatus == cudaSuccess) ? ncclSuccess : ncclInternalError;
}
// Builds the broadcast context: local channels are registered on the output's
// base allocation, while the remote side of every channel is the peers' scratch
// buffer (data is staged there by the root and copied out by receivers).
std::shared_ptr<mscclpp::AlgorithmCtx> BroadcastAlgo6::initBroadcastContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                                            const void*, void* output, size_t,
                                                                            mscclpp::DataType) {
  constexpr int nChannelsPerConnection = 8;
  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
  ctx->rank = comm->bootstrap()->getRank();
  ctx->workSize = comm->bootstrap()->getNranks();
  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
  // setup semaphores
  ctx->memorySemaphores = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection);
  // Resolve the full base allocation containing `output`.
  size_t recvBytes;
  CUdeviceptr recvBasePtr;
  MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
  // register the memory for the broadcast operation
  mscclpp::RegisteredMemory localMemory =
      comm->registerMemory((void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc);
  mscclpp::RegisteredMemory localScratchMemory =
      comm->registerMemory(this->scratchBuffer_.get(), scratchMemSize_, mscclpp::Transport::CudaIpc);
  std::vector<mscclpp::RegisteredMemory> remoteMemories = setupRemoteMemories(comm, ctx->rank, localScratchMemory);
  ctx->memoryChannels =
      setupMemoryChannels(this->conns_, ctx->memorySemaphores, remoteMemories, localMemory, nChannelsPerConnection);
  ctx->memoryChannelDeviceHandles = setupMemoryChannelDeviceHandles(ctx->memoryChannels);
  // keep registered memories reference
  ctx->registeredMemories = std::move(remoteMemories);
  ctx->registeredMemories.push_back(localMemory);
  ctx->registeredMemories.push_back(localScratchMemory);
  return ctx;
}
// Always returns the zero key: the same context (channels bound to the scratch
// buffer) is reused for every broadcast call.
mscclpp::AlgorithmCtxKey BroadcastAlgo6::generateBroadcastContextKey(const void*, void*, size_t, mscclpp::DataType) {
  // always use same context
  return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
}
// Packages this implementation as a generic Algorithm; the shared instance is
// captured by each callback so state persists across invocations.
mscclpp::Algorithm BroadcastAlgo6::build() {
  auto self = std::make_shared<BroadcastAlgo6>();
  auto initFn = [self](std::shared_ptr<mscclpp::Communicator> comm,
                       std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    self->initialize(comm, extras);
  };
  auto kernelFn = [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                         size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                         std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
    return self->broadcastKernelFunc(ctx, input, output, count, dtype, stream, extras);
  };
  auto ctxFn = [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      mscclpp::DataType dtype) {
    return self->initBroadcastContext(comm, input, output, count, dtype);
  };
  auto keyFn = [self](const void* input, void* output, size_t count, mscclpp::DataType dtype) {
    return self->generateBroadcastContextKey(input, output, count, dtype);
  };
  return mscclpp::Algorithm("default_broadcast6", "broadcast", initFn, kernelFn, ctxFn, keyFn);
}

View File

@@ -1,180 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#ifndef BROADCAST_HPP_
#define BROADCAST_HPP_
#include <mscclpp/nccl.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/executor.hpp>
#include <mscclpp/gpu.hpp>
#include <mscclpp/memory_channel.hpp>
#include <mscclpp/memory_channel_device.hpp>
#include "common.hpp"
template <bool IsOutOfPlace>
__global__ void __launch_bounds__(1024, 1)
broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff,
mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels, [[maybe_unused]] size_t channelOutOffset,
size_t rank, [[maybe_unused]] size_t worldSize, size_t root, size_t nRanksPerNode, size_t nelemsPerGPU) {
const size_t nThread = blockDim.x * gridDim.x;
const size_t nPeer = nRanksPerNode - 1;
const size_t chanOffset = nPeer * blockIdx.x;
__shared__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> memChans[MAX_NRANKS_PER_NODE - 1];
if (threadIdx.x < nPeer) {
memChans[threadIdx.x] = memoryChannels[chanOffset + threadIdx.x];
memChans[threadIdx.x].relaxedSignal();
memChans[threadIdx.x].wait();
}
__syncthreads();
const size_t peerRootIdx = (root == rank) ? nPeer : ((root < rank) ? root : (root - 1));
const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
const size_t bytes = bytesPerGPU;
size_t unitBytesPerThread;
if (bytes * nPeer >= nThread * 64) {
unitBytesPerThread = 64;
} else {
unitBytesPerThread = 16;
}
const size_t unitBytesPerBlock = unitBytesPerThread * blockDim.x;
const size_t unitBytes = unitBytesPerBlock * gridDim.x;
const size_t nLoop = bytes / unitBytes;
const size_t maxScratchSizeToUse = (SCRATCH_SIZE - unitBytes);
const size_t nLoopToSync = (maxScratchSizeToUse / unitBytes) + 1;
size_t scratchSub = 0;
// First loop will always fit the scratch size.
if (nLoop > 0) {
// First loop unrolling
const size_t offset = blockIdx.x * unitBytesPerBlock;
if (rank == root) {
char* send_ = reinterpret_cast<char*>(sendbuff);
for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) {
char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_); // Peer's scratchbuff.
mscclpp::copy<16, false>(dst + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x);
__syncthreads();
if (threadIdx.x == peerIdx) memChans[peerIdx].signal();
}
if constexpr (IsOutOfPlace) {
char* recv_ = reinterpret_cast<char*>(recvbuff);
mscclpp::copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x);
}
} else { // rank != root.
if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait();
__syncthreads();
char* recv_ = reinterpret_cast<char*>(recvbuff);
char* scratch_ = reinterpret_cast<char*>(scratchbuff); // My scratchbuff.
mscclpp::copy<16, false>(recv_ + offset, scratch_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x);
}
}
for (size_t i = 1; i < nLoop; ++i) {
const size_t offset = blockIdx.x * unitBytesPerBlock + i * unitBytes;
if (i % nLoopToSync == 0) { // Sync to reuse scratch buff
scratchSub = -i * unitBytes;
deviceSyncer.sync(gridDim.x);
if (threadIdx.x < nPeer) {
memChans[threadIdx.x].relaxedSignal();
memChans[threadIdx.x].wait();
}
}
if (rank == root) {
char* send_ = reinterpret_cast<char*>(sendbuff);
for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) {
char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_); // Peer's scratchbuff.
mscclpp::copy<16, false>(dst + offset + scratchSub, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x);
__syncthreads();
if (threadIdx.x == peerIdx) memChans[peerIdx].signal();
}
if constexpr (IsOutOfPlace) {
char* recv_ = reinterpret_cast<char*>(recvbuff);
mscclpp::copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x);
}
} else { // rank != root.
if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait();
__syncthreads();
char* recv_ = reinterpret_cast<char*>(recvbuff);
char* scratch_ = reinterpret_cast<char*>(scratchbuff); // My scratchbuff.
mscclpp::copy<16, false>(recv_ + offset, scratch_ + offset + scratchSub, unitBytesPerBlock, threadIdx.x,
blockDim.x);
}
}
// Remainder loop will also fit the scratch buff since we subtract unitBytes from SCRATCH_SIZE.
if (bytes % unitBytes > 0) { // remainder.
const size_t offset = blockIdx.x * unitBytesPerBlock + nLoop * unitBytes;
const size_t remainBytes = (offset < bytes) ? (bytes - offset) : 0;
if (remainBytes > 0) {
if (rank == root) {
char* send_ = reinterpret_cast<char*>(sendbuff);
for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) {
char* dst = reinterpret_cast<char*>(memChans[peerIdx].dst_); // Peer's scratchbuff.
mscclpp::copy<16, true>(dst + offset + scratchSub, send_ + offset, remainBytes, threadIdx.x, blockDim.x);
__syncthreads();
if (threadIdx.x == peerIdx) memChans[peerIdx].signal();
}
if constexpr (IsOutOfPlace) {
char* recv_ = reinterpret_cast<char*>(recvbuff);
mscclpp::copy<16, true>(recv_ + offset, send_ + offset, remainBytes, threadIdx.x, blockDim.x);
}
} else { // rank != root.
if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait();
__syncthreads();
char* recv_ = reinterpret_cast<char*>(recvbuff);
char* scratch_ = reinterpret_cast<char*>(scratchbuff); // My scratchbuff.
mscclpp::copy<16, true>(recv_ + offset, scratch_ + offset + scratchSub, remainBytes, threadIdx.x, blockDim.x);
}
} // remainBytes > 0.
}
deviceSyncer.sync(gridDim.x);
if (threadIdx.x < nPeer) {
memChans[threadIdx.x].relaxedSignal();
memChans[threadIdx.x].wait();
}
}
// Host-side launcher for the broadcast6 kernel.
//
// Template parameters:
//   IsOutOfPlace - when true, the root also copies its send buffer into its
//                  own receive buffer inside the kernel.
//   T            - element type of the user buffers.
//
// Returns the result of cudaGetLastError() after the asynchronous launch, so
// callers can detect launch-configuration failures immediately.
template <bool IsOutOfPlace, typename T>
cudaError_t broadcast(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
                      size_t channelOutOffset, int rank, int nRanksPerNode, int root, int worldSize, size_t nelems,
                      cudaStream_t stream) {
  constexpr int kNumBlocks = 7;
  constexpr int kThreadsPerBlock = 1024;
  // The kernel works in int-sized units; convert the element count accordingly.
  const size_t nIntUnits = nelems * sizeof(T) / sizeof(int);
  broadcast6<IsOutOfPlace><<<kNumBlocks, kThreadsPerBlock, 0, stream>>>(
      static_cast<void*>(buff), static_cast<void*>(scratch), static_cast<void*>(resultBuff), memoryChannels,
      channelOutOffset, rank, worldSize, root, nRanksPerNode, nIntUnits);
  return cudaGetLastError();
}
// Builder that packages the memory-channel broadcast kernel ("broadcast6") as
// an mscclpp::Algorithm. Member functions below are declared here and defined
// out of line; build() wires them together into the Algorithm object.
class BroadcastAlgo6 : public mscclpp::AlgorithmBuilder {
 public:
  BroadcastAlgo6() = default;
  // Assembles an Algorithm from the private callbacks declared below.
  mscclpp::Algorithm build() override;

 private:
  // One-time setup performed when the algorithm is first used with a
  // communicator. `extras` carries builder-specific parameters passed through
  // from the caller — presumably scratch-buffer configuration; confirm against
  // the out-of-line definition.
  void initialize(std::shared_ptr<mscclpp::Communicator> comm,
                  std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  // Launches the broadcast kernel using resources held in `ctx`; returns an
  // NCCL-style status for the interposition layer.
  ncclResult_t broadcastKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
                                   size_t count, mscclpp::DataType dtype, cudaStream_t stream,
                                   std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  // Creates the per-(buffers, size, dtype) execution context for broadcast.
  std::shared_ptr<mscclpp::AlgorithmCtx> initBroadcastContext(std::shared_ptr<mscclpp::Communicator> comm, const void*,
                                                              void* output, size_t, mscclpp::DataType);
  // Key used to cache and look up contexts created by initBroadcastContext.
  mscclpp::AlgorithmCtxKey generateBroadcastContextKey(const void*, void*, size_t, mscclpp::DataType);

  std::vector<mscclpp::Connection> conns_;  // peer connections, populated during initialization
  size_t scratchMemSize_;                   // size in bytes of scratchBuffer_
  std::shared_ptr<char> scratchBuffer_;     // device scratch memory used by the kernel
};
#endif // BROADCAST_HPP_

View File

@@ -1,67 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

// Common constants, device-side globals, and channel/connection setup helpers
// shared by the NCCL-interposition collectives.
#ifndef NCCL_COMMON_HPP_
#define NCCL_COMMON_HPP_

#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/env.hpp>
#include <mscclpp/memory_channel.hpp>
#include <mscclpp/switch_channel.hpp>

#include <vector>

#if defined(__HIP_PLATFORM_AMD__)
#define WARP_SIZE 64
// HIP has no __syncwarp(); a wave barrier is the closest equivalent on AMD.
#define __syncwarp() __builtin_amdgcn_wave_barrier()
#else
#define WARP_SIZE 32
#endif

constexpr int NUM_NVLS_CONNECTION = 8;   // NVLS (switch) connections per communicator
constexpr int NUM_SEMAPHORES = 64;       // device semaphores placed in __constant__ memory below
constexpr int MAX_NRANKS_PER_NODE = 8;   // upper bound on intra-node peers
// 140MB total: double buffer * 35 thread-blocks * 8 ranks * 256KB per block.
constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;

// NOTE: declared `static` in a header, so every translation unit that includes
// this file gets its own copy, each initialized from the environment.
static bool mscclppDisableChannelCache = mscclpp::env()->disableChannelCache;

// Device-side globals used by the collective kernels. Being defined (not just
// declared) in a header, these are per-translation-unit symbols.
__device__ mscclpp::DeviceSyncer deviceSyncer;
__constant__ mscclpp::DeviceSemaphore deviceSemaphore[NUM_SEMAPHORES];

// Exchanges `localMemory` with all peers and returns their registered memories.
std::vector<mscclpp::RegisteredMemory> setupRemoteMemories(std::shared_ptr<mscclpp::Communicator> comm, int rank,
                                                           mscclpp::RegisteredMemory localMemory);
// Builds memory channels pairing each connection/semaphore with the matching
// remote memory, `nChannelsPerConnection` channels per connection.
std::vector<mscclpp::MemoryChannel> setupMemoryChannels(
    const std::vector<mscclpp::Connection>& connections,
    const std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>>& memorySemaphores,
    const std::vector<mscclpp::RegisteredMemory>& remoteMemories, mscclpp::RegisteredMemory localMemory,
    int nChannelsPerConnection);
// Establishes connections from this rank to all peers in the communicator.
std::vector<mscclpp::Connection> setupConnections(std::shared_ptr<mscclpp::Communicator> comm);
// Creates device-to-device semaphores for each connection.
std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> setupMemorySemaphores(
    std::shared_ptr<mscclpp::Communicator> comm, const std::vector<mscclpp::Connection>& connections,
    int nChannelsPerConnection);
// Copies channel handles into device-accessible memory for kernel use.
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> setupMemoryChannelDeviceHandles(
    const std::vector<mscclpp::MemoryChannel>& memoryChannels);
// NVLS (NVLink SHARP) connection/channel setup; used by switch-based kernels.
std::vector<std::shared_ptr<mscclpp::NvlsConnection>> setupNvlsConnections(std::shared_ptr<mscclpp::Communicator> comm,
                                                                           size_t size, int numConnections);
std::vector<mscclpp::SwitchChannel> setupNvlsChannels(std::vector<std::shared_ptr<mscclpp::NvlsConnection>> conns,
                                                      void* buffer, size_t bufferSize, int nSwitchChannels);
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SwitchChannel>> setupNvlsChannelDeviceHandles(
    const std::vector<mscclpp::SwitchChannel>& nvlsChannels);
// Base (signal-only) memory channels — no data buffers attached.
std::vector<mscclpp::BaseMemoryChannel> setupBaseMemoryChannels(
    const std::vector<mscclpp::Connection>& connections,
    const std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>>& memorySemaphores,
    int nChannelsPerConnection);
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>> setupBaseMemoryChannelDeviceHandles(
    const std::vector<mscclpp::BaseMemoryChannel>& baseMemoryChannels);

#endif  // NCCL_COMMON_HPP_

View File

@@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:

View File

@@ -102,7 +102,7 @@ Key file: `dsl_with_nccl_api.py`.
#### Launch with interposition
To run with NCCL interposition, you preload the MSCCL++ shim so it transparently intercepts NCCL calls made by PyTorch's nccl backend.
```bash
LD_PRELOAD=<MSCCLPP_REPO>/build/apps/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
```
## Notices:
- When using NCCL interposition, the algorithm selection order is:

Binary file not shown.

After

Width:  |  Height:  |  Size: 43 KiB

View File

@@ -19,7 +19,7 @@ $ make -j allgather_test_perf allreduce_test_perf
For example, the following command runs the `allreduce5` algorithm with 8 GPUs starting from 3MB to 48MB messages, by doubling the message size in between. You can try different algorithms by changing the `-k 5` option to another value (e.g., `-k 3` runs `allreduce3`). Check all algorithms from the code: [allreduce_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allreduce_test.cu) and [allgather_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allgather_test.cu).
```bash
$ mpirun --bind-to numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5
$ mpirun --bind-to numa -np 8 ./bin/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5
```
*NOTE: a few algorithms set a condition on the total data size, such as to be a multiple of 3. If the condition is unmet, the command will throw a corresponding error.*
@@ -27,7 +27,7 @@ $ mpirun --bind-to numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 4
Check the help message for more details.
```bash
$ ./test/mscclpp-test/allreduce_test_perf --help
$ ./bin/allreduce_test_perf --help
USAGE: allreduce_test_perf
[-b,--minbytes <min size in bytes>]
[-e,--maxbytes <max size in bytes>]

View File

@@ -0,0 +1,470 @@
# MSCCL++ Torch Integration Guide
This guide shows how to use MSCCL++ with PyTorch for high-performance collective communication.
## Quick Start
MSCCL++ provides three ways to implement collective algorithms with PyTorch:
| Approach | Best For | Complexity |
|----------|----------|------------|
| **Default Algorithms** | Quick integration, standard use cases | Low |
| **DSL-based** | Custom communication patterns without C++ | Medium |
| **Native C++/CUDA** | Maximum control, custom kernels | High |
```{figure} ../figs/customize_algo.png
:name: MSCCL++ Customization Algorithm Selection
:alt: MSCCL++ Customization Algorithm Selection
:align: center
:width: 400px
MSCCL++ Customization Algorithm Selection Overview
```
## Prerequisites
Before starting, ensure you have:
- MSCCL++ installed with Python bindings
- PyTorch with CUDA support
**Required Environment Variables:**
```bash
export MSCCLPP_MASTER_ADDR=<master_node_ip> # IP address of master node
export MSCCLPP_MASTER_PORT=<port> # Port for communication (e.g., 29500)
```
## Common Setup: Creating a Communicator
All approaches require initializing an MSCCL++ communicator. Here's a reusable setup:
```python
import os
import torch
import netifaces as ni
import ipaddress
def get_network_interface(ip: str):
"""Find the network interface for the given IP address."""
target = ipaddress.ip_address(ip)
for interface in ni.interfaces():
addresses = ni.ifaddresses(interface)
if ni.AF_INET in addresses:
for link in addresses[ni.AF_INET]:
if "addr" in link:
if ipaddress.ip_address(link["addr"]) == target:
return interface
return None
def init_communicator():
"""Initialize MSCCL++ communicator from environment variables."""
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
local_rank = int(os.environ.get("LOCAL_RANK", rank))
torch.cuda.set_device(local_rank)
master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
master_port = os.environ["MSCCLPP_MASTER_PORT"]
interface = get_network_interface(master_addr)
if interface is None:
raise ValueError(f"Cannot find network interface for IP {master_addr}")
interface_trio = f"{interface}:{master_addr}:{master_port}"
comm_group = mscclpp.CommGroup(
interfaceIpPortTrio=interface_trio,
rank=rank,
size=world_size
)
return comm_group
```
---
## Approach 1: Default Built-in Algorithms (Easiest)
Use pre-built, optimized algorithms from MSCCL++. Best for standard collective operations.
**Example:** [customized_comm_with_default_algo.py](../../examples/torch-integration/customized_comm_with_default_algo.py)
### Step 1: Load Default Algorithms
```python
import mscclpp
import mscclpp.utils as mscclpp_utils
def load_algorithms(scratch_buffer: torch.Tensor, rank: int):
"""Load MSCCL++ default algorithm collection."""
collection_builder = mscclpp.AlgorithmCollectionBuilder()
return collection_builder.build_default_algorithms(
scratch_buffer=scratch_buffer.data_ptr(),
scratch_buffer_size=scratch_buffer.nbytes,
rank=rank
)
```
### Step 2: Create a Custom Communicator Class
```python
class CustomizedComm:
def __init__(self, comm: mscclpp.CommGroup):
self.comm = comm
# Allocate scratch buffer (required by some algorithms)
dlpack = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
self.scratch_buffer = torch.utils.dlpack.from_dlpack(dlpack)
# Load and select algorithms
algorithms = load_algorithms(self.scratch_buffer, comm.my_rank)
# Select specific algorithms by name
self._algo_small = [
algo for algo in algorithms
if algo.collective == "allreduce"
and algo.name == "default_allreduce_nvls_packet"
][0]
self._algo_large = [
algo for algo in algorithms
if algo.collective == "allreduce"
and algo.name == "default_allreduce_nvls_with_copy"
][0]
def all_reduce(self, tensor: torch.Tensor, stream=None):
# Select algorithm based on message size
algo = self._algo_small if tensor.nbytes < (1 << 20) else self._algo_large
algo.execute(
comm=self.comm.communicator,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
op=mscclpp.ReduceOp.SUM,
stream=stream.cuda_stream if stream else 0
)
```
### Step 3: Run
```bash
MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_default_algo.py
```
---
## Approach 2: DSL-based Algorithms (Medium)
Define custom communication patterns using MSCCL++ Python DSL. No C++ required.
**Example:** [customized_comm_with_dsl.py](../../examples/torch-integration/customized_comm_with_dsl.py)
### Step 1: Define the Collective Program
```python
import mscclpp
from mscclpp.language.collectives import AllReduce
from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType
from mscclpp.language.program import CollectiveProgram
from mscclpp.language.rank import Rank
def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
"""Define an allreduce using NVLS (NVLink SHARP)."""
gpu_size = spec.world_size
with CollectiveProgram(
spec.name,
spec.collective,
gpu_size,
instances=spec.instances,
protocol=spec.protocol,
num_threads_per_block=spec.num_threads_per_block,
min_message_size=spec.min_message_size,
max_message_size=spec.max_message_size,
) as program:
# Create NVLS channel for all GPUs
nvls_chan = SwitchChannel(
rank_list=[gpu for gpu in range(gpu_size)],
buffer_type=BufferType.input
)
# Create memory channels for synchronization
channels = {}
for gpu in range(gpu_size):
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)] = MemoryChannel(peer, gpu)
# Synchronize before operation
for gpu in range(gpu_size):
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)].signal(tb=0, relaxed=True)
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)].wait(tb=0, relaxed=True, data_sync=SyncType.after)
# Perform reduce and broadcast
for gpu in range(gpu_size):
rank = Rank(gpu)
input_buffer = rank.get_input_buffer()
nvls_chan.at_rank(gpu).reduce(
buffer_offset=gpu, size=1,
dst_chunk=input_buffer[gpu:gpu+1], tb=0
)
nvls_chan.at_rank(gpu).broadcast(
src_chunk=input_buffer[gpu:gpu+1],
buffer_offset=gpu, size=1, tb=0
)
# Synchronize after operation
for gpu in range(gpu_size):
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
for peer in range(gpu_size):
if peer != gpu:
channels[(peer, gpu)].wait(tb=0, relaxed=True)
return program
```
### Step 2: Compile the Algorithm
```python
def setup_algorithm(rank: int, world_size: int, nranks_per_node: int):
"""Compile the DSL algorithm for this rank."""
spec = mscclpp.language.AlgoSpec(
name="allreduce_nvls",
collective=AllReduce(world_size, 1, True),
nranks_per_node=nranks_per_node,
world_size=world_size,
in_place=True,
instances=nranks_per_node,
protocol="Simple",
num_threads_per_block=1024,
min_message_size=1 << 20,
max_message_size=48 << 30,
tags={"nvls": 1},
)
return mscclpp.compile(algo=allreduce_nvls, algo_spec=spec, rank=rank)
```
### Step 3: Execute with Executor
DSL algorithms require an `Executor`:
```python
class CustomizedComm:
def __init__(self, comm: mscclpp.CommGroup, algorithm):
self.comm = comm
self.executor = mscclpp.Executor(comm.communicator) # Required for DSL
self.algorithm = algorithm
def all_reduce(self, tensor: torch.Tensor, stream=None):
self.algorithm.execute(
comm=self.comm.communicator,
executor=self.executor, # Pass executor for DSL algorithms
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
stream=stream.cuda_stream if stream else 0
)
```
### Step 4: Run
```bash
MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_dsl.py
```
---
## Approach 3: Native C++/CUDA Kernels (Advanced)
Write custom CUDA kernels for maximum performance and control.
**Example:** [customized_allgather.py](../../examples/torch-integration/customized_allgather.py) + [customized_allgather.cu](../../examples/torch-integration/customized_allgather.cu)
### Step 1: Implement the CUDA Kernel and Algorithm Builder
Create a `.cu` file with your kernel and algorithm builder:
```cpp
// customized_allgather.cu
#include <mscclpp/algorithm.hpp>
#include <mscclpp/core.hpp>
#include <pybind11/pybind11.h>
namespace py = pybind11;
// Your CUDA kernel
__global__ void allgather(
mscclpp::DeviceHandle<mscclpp::PortChannel>* channels,
int rank,
size_t nbytesPerGPU
) {
// Kernel implementation...
}
// Algorithm builder class
class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
public:
std::shared_ptr<mscclpp::Algorithm> build() override {
auto self = std::make_shared<AllgatherAlgoBuilder>();
return std::make_shared<mscclpp::NativeAlgorithm>(
"allgather", // Algorithm name
"allgather", // Collective type
// Initialize function
[self](std::shared_ptr<mscclpp::Communicator> comm) {
self->initialize(comm);
},
// Kernel execution function
[self](const std::shared_ptr<void> ctx,
const void* input, void* output,
size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, mscclpp::ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) {
return self->kernelFunc(ctx, input, output, inputSize, dtype, stream);
},
// Context initialization function
[self](std::shared_ptr<mscclpp::Communicator> comm,
const void* input, void* output,
size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->initContext(comm, input, output, inputSize, dtype);
},
// Context key generation function
[self](const void* input, void* output,
size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateContextKey(input, output, inputSize, outputSize, dtype);
}
);
}
private:
void initialize(std::shared_ptr<mscclpp::Communicator> comm) { /* ... */ }
mscclpp::CommResult kernelFunc(const std::shared_ptr<void> ctx, /* ... */) { /* ... */ }
std::shared_ptr<void> initContext(/* ... */) { /* ... */ }
mscclpp::AlgorithmCtxKey generateContextKey(/* ... */) { /* ... */ }
};
// Expose to Python
PYBIND11_MODULE(mscclpp_native, m) {
m.def("create_allgather_algorithm", []() {
auto builder = std::make_shared<AllgatherAlgoBuilder>();
auto algo = builder->build();
// Return as PyCapsule (see full example for capsule handling)
return py::reinterpret_steal<py::capsule>(getCapsule(algo));
});
}
```
### Step 2: Compile and Load in Python
```python
import mscclpp
import os
# MSCCL++ compiles the .cu file at runtime using JIT
mscclpp_native = mscclpp.compile_native(
name="mscclpp_native",
file=os.path.join(os.path.dirname(__file__), "customized_allgather.cu")
)
# Get the algorithm from the compiled module
capsule = mscclpp_native.create_allgather_algorithm()
algorithm = mscclpp.Algorithm.create_from_native_capsule(capsule)
```
### Step 3: Execute
```python
class CustomizedComm:
def __init__(self, comm: mscclpp.CommGroup):
self.comm = comm
# Compile and load native algorithm
mscclpp_native = mscclpp.compile_native(
name="mscclpp_native",
file="customized_allgather.cu"
)
capsule = mscclpp_native.create_allgather_algorithm()
self.algorithm = mscclpp.Algorithm.create_from_native_capsule(capsule)
def all_gather(self, tensor: torch.Tensor, out_tensor: torch.Tensor, stream=None):
self.algorithm.execute(
self.comm.communicator,
tensor.data_ptr(),
out_tensor.data_ptr(),
tensor.nbytes,
out_tensor.nbytes,
mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
stream=stream.cuda_stream if stream else 0
)
```
### Step 4: Run
```bash
MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
torchrun --nnodes=1 --nproc_per_node=8 customized_allgather.py
```
---
## Algorithm.execute() API Reference
All algorithms use the same `execute()` interface:
```python
algorithm.execute(
comm, # mscclpp.Communicator (required)
input_buffer, # int: input data pointer from tensor.data_ptr()
output_buffer, # int: output data pointer
input_size, # int: input size in bytes (tensor.nbytes)
output_size, # int: output size in bytes
dtype, # mscclpp.DataType: data type
op=mscclpp.ReduceOp.NOP, # Reduction operation (for reduce collectives)
stream=0, # CUDA stream handle
executor=None, # mscclpp.Executor (required for DSL algorithms)
nblocks=0, # Thread blocks (0 = auto)
nthreads_per_block=0, # Threads per block (0 = auto)
extras=None # dict[str, int]: extra pointer parameters
)
```
**Data Type Conversion:**
```python
import mscclpp.utils as mscclpp_utils
# Convert PyTorch dtype to MSCCL++ dtype
mscclpp_dtype = mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype)
```
**Getting CUDA Stream:**
```python
stream_handle = torch.cuda.current_stream().cuda_stream
```
---
## Summary
| Approach | When to Use | Key Requirement |
|----------|-------------|-----------------|
| Default | Standard operations, quick setup | Scratch buffer |
| DSL | Custom patterns, no C++ needed | Executor |
| Native | Custom kernels, max performance | `.cu` file with pybind11 |
All examples are in [`examples/torch-integration/`](../../examples/torch-integration/).

View File

@@ -12,3 +12,5 @@ This section provides advanced topics and best practices for using MSCCL++. It i
guide/advanced-connections
guide/cpp-examples
guide/customized-algorithm-with-nccl-api
guide/mscclpp-dsl-integration
guide/mscclpp-torch-integration

View File

@@ -132,20 +132,20 @@ For more details on how to use the Dev Container, see the [Dev Containers tutori
```bash
$ make -j unit_tests
$ ./test/unit_tests
$ ./bin/unit_tests
```
For thorough testing of MSCCL++ features, we need to use `mp_unit_tests` that require at least two GPUs on the system. `mp_unit_tests` also requires MPI to be installed on the system. For example, the following commands compile and run `mp_unit_tests` with two processes (two GPUs). The number of GPUs can be changed by changing the number of processes.
```bash
$ make -j mp_unit_tests
$ mpirun -np 2 ./test/mp_unit_tests
$ mpirun -np 2 ./bin/mp_unit_tests
```
To run `mp_unit_tests` with more than two nodes, you need to specify the `-ip_port` argument that is accessible from all nodes. For example:
```bash
$ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.0.0.5:50000
$ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0.0.5:50000
```
## Performance Benchmark
@@ -166,12 +166,13 @@ $ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) APIs using MSCCL++. How to use:
1. [Build MSCCL++ from source](#install-from-source).
2. Replace your `libnccl.so` library with `libmscclpp_nccl.so`, which is compiled under `./build/apps/nccl/` directory.
2. Replace your `libnccl.so` library with `libmscclpp_nccl.so`, which is compiled under `./build/lib/` directory.
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
```bash
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
If MSCCL++ is built on AMD platforms, `libmscclpp_nccl.so` would replace the [RCCL](https://github.com/ROCm/rccl) library (i.e., `librccl.so`).
@@ -188,12 +189,14 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec
Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list.
```bash
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
Example 2, ReduceScatter will still use msccl++ implementation since reducescatter is not in the fallbacklist.
```bash
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
On AMD platforms, you need to add `RCCL_MSCCL_ENABLE=0` to avoid conflicts with the fallback features.

View File

@@ -1,6 +1,7 @@
sphinx==8.2.3
breathe==4.36.0
myst_parser==4.0.1
pybind11
sphinx_rtd_theme
sphinxcontrib-mermaid
sphinx-autodoc-typehints

View File

@@ -15,9 +15,9 @@ from mscclpp import (
Transport,
)
from mscclpp.utils import GpuBuffer
import mscclpp.comm as mscclpp_comm
def create_connection(group: mscclpp_comm.CommGroup, transport: str):
def create_connection(group: mscclpp.CommGroup, transport: str):
remote_nghrs = list(range(group.nranks))
remote_nghrs.remove(group.my_rank)
if transport == "NVLink":
@@ -30,7 +30,7 @@ def create_connection(group: mscclpp_comm.CommGroup, transport: str):
return connections
if __name__ == "__main__":
mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
mscclpp_group = mscclpp.CommGroup(MPI.COMM_WORLD)
connections = create_connection(mscclpp_group, "NVLink")
nelems = 1024
memory = GpuBuffer(nelems, dtype=cp.int32)

View File

@@ -16,7 +16,7 @@ SRC = customized_allgather.cu
all: $(TARGET)
$(TARGET): $(SRC)
$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp -lnccl
$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp_collectives -lmscclpp -lnccl
clean:
rm -f $(TARGET)

View File

@@ -1,7 +1,7 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mscclpp/nccl.h>
#include <mscclpp/ext/nccl/nccl.h>
#include <sys/wait.h>
#include <filesystem>
@@ -10,6 +10,7 @@
#include <memory>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/ext/collectives/algorithm_collection_builder.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <unordered_map>
@@ -76,6 +77,15 @@ __global__ void __launch_bounds__(1024)
}
}
struct Context {
int rank;
int workSize;
int nRanksPerNode;
std::vector<mscclpp::RegisteredMemory> registeredMemories;
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::PortChannel>> portChannelDeviceHandles;
};
class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
public:
AllgatherAlgoBuilder() = default;
@@ -85,23 +95,21 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
}
}
mscclpp::Algorithm build() {
std::shared_ptr<mscclpp::Algorithm> build() override {
auto self = std::make_shared<AllgatherAlgoBuilder>();
mscclpp::Algorithm allgatherAlgo(
"allgather", "allgather",
[self](std::shared_ptr<mscclpp::Communicator> comm, std::unordered_map<std::string, std::shared_ptr<void>>&) {
self->initialize(comm);
std::shared_ptr<mscclpp::Algorithm> allgatherAlgo = std::make_shared<mscclpp::NativeAlgorithm>(
"allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
return self->allgatherKernelFunc(ctx, input, output, inputSize, stream);
},
[self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t count,
int dtype, cudaStream_t stream, std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
return self->allgatherKernelFunc(ctx, input, output, count, static_cast<ncclDataType_t>(dtype), stream,
extras);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count, int dtype) {
return self->initAllgatherContext(comm, input, output, count, static_cast<ncclDataType_t>(dtype));
},
[self](const void* input, void* output, size_t count, int dtype) {
return self->generateAllgatherContextKey(input, output, count, static_cast<ncclDataType_t>(dtype));
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize,
static_cast<ncclDataType_t>(dtype));
});
return allgatherAlgo;
}
@@ -126,34 +134,32 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
proxyService_->startProxy();
}
ncclResult_t allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
size_t count, [[maybe_unused]] ncclDataType_t dtype, cudaStream_t stream,
std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
int rank = ctx->rank;
int worldSize = ctx->workSize;
mscclpp::CommResult allgatherKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
size_t inputSize, cudaStream_t stream) {
auto algoCtx = std::static_pointer_cast<Context>(ctx);
int rank = algoCtx->rank;
int worldSize = algoCtx->workSize;
int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE;
allgather<<<1, nThreadsPerBlock, 0, stream>>>(ctx->portChannelDeviceHandles.get(), rank,
count * ncclTypeSize(dtype));
allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputSize);
if (cudaGetLastError() == cudaSuccess) {
return ncclSuccess;
return mscclpp::CommResult::CommSuccess;
}
return ncclInternalError;
return mscclpp::CommResult::CommInternalError;
}
std::shared_ptr<mscclpp::AlgorithmCtx> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm,
const void* input, void* output, size_t count,
ncclDataType_t dtype) {
auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
std::shared_ptr<void> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm, const void* input,
void* output, size_t inputSize, mscclpp::DataType dtype) {
auto ctx = std::make_shared<Context>();
ctx->rank = comm->bootstrap()->getRank();
ctx->workSize = comm->bootstrap()->getNranks();
ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
// register memories
mscclpp::RegisteredMemory inputBufRegMem =
comm->registerMemory((void*)input, count * ncclTypeSize(dtype), mscclpp::Transport::CudaIpc);
comm->registerMemory((void*)input, inputSize, mscclpp::Transport::CudaIpc);
mscclpp::RegisteredMemory outputBufRegMem =
comm->registerMemory(output, count * ncclTypeSize(dtype) * ctx->workSize, mscclpp::Transport::CudaIpc);
comm->registerMemory(output, inputSize * ctx->workSize, mscclpp::Transport::CudaIpc);
std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteRegMemories;
for (int i = 0; i < ctx->workSize; i++) {
if (i == ctx->rank) continue;
@@ -184,9 +190,9 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
return ctx;
}
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t count,
ncclDataType_t dtype) {
return {(void*)input, output, count * ncclTypeSize(dtype), count * ncclTypeSize(dtype) * worldSize_, 0};
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
size_t outputSize, ncclDataType_t dtype) {
return {(void*)input, output, inputSize, outputSize, 0};
}
};
@@ -197,16 +203,16 @@ void worker(int rank, int worldSize, ncclUniqueId id) {
// register algorithm
auto allgatherAlgoBuilder = std::make_shared<AllgatherAlgoBuilder>();
mscclpp::AlgorithmCollectionBuilder::getInstance()->addAlgorithmBuilder(allgatherAlgoBuilder);
mscclpp::AlgorithmCollectionBuilder::getInstance()->setAlgorithmSelector(
[](const std::unordered_map<std::string, std::unordered_map<std::string, mscclpp::Algorithm>>&
auto algoCollectionBuilder = mscclpp::collective::AlgorithmCollectionBuilder::getInstance();
algoCollectionBuilder->addAlgorithmBuilder(allgatherAlgoBuilder);
algoCollectionBuilder->setAlgorithmSelector(
[](const std::unordered_map<std::string, std::unordered_map<std::string, std::shared_ptr<mscclpp::Algorithm>>>&
algoMapByCollective,
std::string collective, const void* input, void* output, size_t messageSize, int dtype, int nRanksPerNode,
int worldSize) {
if (collective != "allgather") {
return mscclpp::Algorithm();
const mscclpp::CollectiveRequest& request) -> std::shared_ptr<mscclpp::Algorithm> {
if (request.collective != "allgather") {
return nullptr;
}
return algoMapByCollective.at(collective).at("allgather");
return algoMapByCollective.at(request.collective).at("allgather");
});
float *sendbuff, *recvbuff;
@@ -259,6 +265,7 @@ void worker(int rank, int worldSize, ncclUniqueId id) {
MSCCLPP_CUDATHROW(cudaFree(recvbuff));
ncclCommDestroy(comm);
algoCollectionBuilder->reset();
}
int main() {

View File

@@ -0,0 +1,199 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <Python.h>
#include <mscclpp/ext/nccl/nccl.h>
#include <pybind11/pybind11.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/port_channel.hpp>
#include <mscclpp/port_channel_device.hpp>
namespace py = pybind11;
#if defined(__HIP_PLATFORM_AMD__)
#define WARP_SIZE 64
#else
#define WARP_SIZE 32
#endif
/// Simple all-gather implemented as an all-to-all over port channels.
/// Each of the (worldSize - 1) warps drives the channel to one remote rank:
/// lane 0 of the warp puts this rank's shard, flushes the proxy, then waits
/// for the shard pushed by that peer.
__global__ void __launch_bounds__(1024)
    allgather(mscclpp::DeviceHandle<mscclpp::PortChannel>* portChannels, int rank, size_t nbytesPerGPU) {
  int warpId = threadIdx.x / WARP_SIZE;
  // Each warp is responsible for one of the remote ranks
  mscclpp::DeviceHandle<mscclpp::PortChannel> portChan = portChannels[warpId];
  // this allgather is really simple and implemented as an alltoall
  // Sender role: lane 0 of each warp asynchronously puts this rank's shard
  // (offset rank * nbytesPerGPU on the remote side) and raises a signal.
  if (threadIdx.x % WARP_SIZE == 0) {
    portChan.putWithSignal(rank * nbytesPerGPU, 0, nbytesPerGPU);
  }
  // make sure everyone has issued its put before any thread blocks everyone else in flush/wait
  __syncthreads();
  // Drain the proxy queue so the puts are actually submitted.
  if (threadIdx.x % WARP_SIZE == 0) {
    portChan.flush();
  }
  // Receiver role: wait on the semaphore until the peer's shard has arrived.
  if (threadIdx.x % WARP_SIZE == 0) {
    portChan.wait();
  }
}
/// Per-buffer-configuration state cached between allgather invocations.
/// Created by initAllgatherContext and looked up via AlgorithmCtxKey.
struct Context {
  int rank;           // this rank's index in the communicator
  int workSize;       // total number of ranks (name kept as-is; "work" not "world")
  int nRanksPerNode;  // ranks co-located on this node
  // Keeps local and remote buffer registrations alive for the context lifetime.
  std::vector<mscclpp::RegisteredMemory> registeredMemories;
  // Device-side array of channel handles, one per remote rank.
  std::shared_ptr<mscclpp::DeviceHandle<mscclpp::PortChannel>> portChannelDeviceHandles;
};
/// Builds a NativeAlgorithm implementing allgather over port channels.
///
/// The builder owns the inter-rank connections and the proxy service shared by
/// all algorithm contexts; per-buffer state (registered memories, channels)
/// lives in Context instances created on demand and cached by context key.
class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
 public:
  AllgatherAlgoBuilder() = default;

  ~AllgatherAlgoBuilder() {
    // Stop the proxy thread if initialize() ever started it.
    if (proxyService_) {
      proxyService_->stopProxy();
    }
  }

  /// Create the allgather Algorithm. A fresh builder instance is captured by
  /// every callback so its connections/proxy outlive this factory object.
  std::shared_ptr<mscclpp::Algorithm> build() override {
    auto self = std::make_shared<AllgatherAlgoBuilder>();
    std::shared_ptr<mscclpp::Algorithm> allgatherAlgo = std::make_shared<mscclpp::NativeAlgorithm>(
        "allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
        [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
               mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
               int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
          return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream);
        },
        [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
               size_t outputSize,
               mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
        [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
          return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype);
        });
    return allgatherAlgo;
  }

 private:
  std::vector<mscclpp::Connection> conns_;  // one connection per remote rank
  std::shared_ptr<mscclpp::ProxyService> proxyService_;
  int worldSize_ = 0;  // set in initialize(); zero-initialized so it is never read as garbage

  /// One-time setup: connect to every remote rank over CUDA IPC and start the
  /// proxy service that drives the port channels.
  void initialize(std::shared_ptr<mscclpp::Communicator> comm) {
    std::vector<std::shared_future<mscclpp::Connection>> connectionFutures;
    worldSize_ = comm->bootstrap()->getNranks();
    for (int i = 0; i < worldSize_; i++) {
      if (i == comm->bootstrap()->getRank()) continue;
      connectionFutures.push_back(comm->connect(mscclpp::Transport::CudaIpc, i));
    }
    std::vector<mscclpp::Connection> connections;
    std::transform(connectionFutures.begin(), connectionFutures.end(), std::back_inserter(connections),
                   [](const auto& future) { return future.get(); });
    this->conns_ = std::move(connections);
    proxyService_ = std::make_shared<mscclpp::ProxyService>();
    proxyService_->startProxy(true);
  }

  /// Launch the allgather kernel: a single block with one warp per remote rank.
  /// Returns CommInternalError if the launch failed.
  mscclpp::CommResult allgatherKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
                                          size_t inputBytes, [[maybe_unused]] mscclpp::DataType dtype,
                                          cudaStream_t stream) {
    auto algoCtx = std::static_pointer_cast<Context>(ctx);
    int rank = algoCtx->rank;
    int worldSize = algoCtx->workSize;
    int nThreadsPerBlock = (worldSize - 1) * WARP_SIZE;
    allgather<<<1, nThreadsPerBlock, 0, stream>>>(algoCtx->portChannelDeviceHandles.get(), rank, inputBytes);
    if (cudaGetLastError() == cudaSuccess) {
      return mscclpp::CommResult::CommSuccess;
    }
    return mscclpp::CommResult::CommInternalError;
  }

  /// Build a Context for this (input, output) buffer pair: register both
  /// buffers, exchange the output registration with every peer, and create one
  /// port channel per remote rank (remote output as destination, local input
  /// as source).
  std::shared_ptr<void> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm, const void* input,
                                             void* output, size_t inputBytes,
                                             [[maybe_unused]] mscclpp::DataType dtype) {
    auto ctx = std::make_shared<Context>();
    ctx->rank = comm->bootstrap()->getRank();
    ctx->workSize = comm->bootstrap()->getNranks();
    ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
    // register memories
    mscclpp::RegisteredMemory inputBufRegMem =
        comm->registerMemory((void*)input, inputBytes, mscclpp::Transport::CudaIpc);
    mscclpp::RegisteredMemory outputBufRegMem =
        comm->registerMemory(output, inputBytes * ctx->workSize, mscclpp::Transport::CudaIpc);
    std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteRegMemories;
    for (int i = 0; i < ctx->workSize; i++) {
      if (i == ctx->rank) continue;
      comm->sendMemory(outputBufRegMem, i, 0);
      remoteRegMemories.push_back(comm->recvMemory(i, 0));
    }
    // setup channels
    std::vector<mscclpp::DeviceHandle<mscclpp::PortChannel>> portChannels;
    mscclpp::MemoryId inputMemoryId = this->proxyService_->addMemory(inputBufRegMem);
    for (size_t i = 0; i < this->conns_.size(); i++) {  // size_t: avoid signed/unsigned comparison
      auto remoteMemory = remoteRegMemories[i].get();
      mscclpp::MemoryId remoteMemoryId = this->proxyService_->addMemory(remoteMemory);
      portChannels.push_back(mscclpp::deviceHandle(this->proxyService_->portChannel(
          this->proxyService_->buildAndAddSemaphore(*comm, this->conns_[i]), remoteMemoryId, inputMemoryId)));
    }
    ctx->portChannelDeviceHandles =
        mscclpp::detail::gpuCallocShared<mscclpp::DeviceHandle<mscclpp::PortChannel>>(portChannels.size());
    mscclpp::gpuMemcpy(ctx->portChannelDeviceHandles.get(), portChannels.data(), portChannels.size(),
                       cudaMemcpyHostToDevice);
    // keep registered memory references alive for the lifetime of the context
    std::transform(remoteRegMemories.begin(), remoteRegMemories.end(), std::back_inserter(ctx->registeredMemories),
                   [](const auto& fut) { return fut.get(); });
    ctx->registeredMemories.push_back(inputBufRegMem);
    ctx->registeredMemories.push_back(outputBufRegMem);
    return ctx;
  }

  /// Contexts are keyed by buffer pointers and sizes so repeated calls with
  /// the same buffers reuse the registered memories and channels.
  mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
                                                       size_t outputSize, [[maybe_unused]] mscclpp::DataType dtype) {
    return {(void*)input, output, inputSize, outputSize, 0};
  }
};
std::shared_ptr<mscclpp::Algorithm> createAllgatherAlgorithm() {
auto allgatherAlgoBuilder = std::make_shared<AllgatherAlgoBuilder>();
return allgatherAlgoBuilder->build();
}
/// PyCapsule destructor: frees the heap-allocated shared_ptr created in getCapsule.
void deletePtr(PyObject* capsule) {
  void* raw = PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule));
  if (raw == nullptr) {
    // Invalid capsule or name mismatch: report the pending error, don't crash.
    PyErr_WriteUnraisable(capsule);
    return;
  }
  delete static_cast<std::shared_ptr<mscclpp::Algorithm>*>(raw);
}
/// Wrap a shared_ptr<Algorithm> in a PyCapsule owned by the Python side.
/// The capsule holds a heap-allocated shared_ptr copy; deletePtr releases it.
/// @throws pybind11::error_already_set if the capsule cannot be created.
PyObject* getCapsule(std::shared_ptr<mscclpp::Algorithm> algo) {
  auto* holder = new std::shared_ptr<mscclpp::Algorithm>(algo);
  PyObject* capsule = PyCapsule_New(holder, mscclpp::ALGORITHM_NATIVE_CAPSULE_NAME, deletePtr);
  if (capsule != nullptr) {
    return capsule;
  }
  delete holder;  // capsule creation failed; avoid leaking the holder
  throw pybind11::error_already_set();
}
// Python extension entry point: exposes the native allgather algorithm as a
// PyCapsule that MSCCL++'s Python bindings can consume.
PYBIND11_MODULE(mscclpp_native, m) {
  m.doc() = "A simple C++ extension for mscclpp customized algorithm";
  m.def(
      "create_allgather_algorithm",
      // reinterpret_steal: the capsule already owns one reference; avoid double-incref.
      []() { return py::reinterpret_steal<py::capsule>(getCapsule(createAllgatherAlgorithm())); },
      "Create an allgather algorithm and return it as a PyCapsule usable by MSCCL++ Python bindings");
}

View File

@@ -0,0 +1,84 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_allgather.py
# For AMD: MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> GPU_MAX_HW_QUEUES=7 torchrun --nnodes=1 --nproc_per_node=8 customized_allgather.py
import mscclpp
import mscclpp.utils as mscclpp_utils
import torch
import os
import netifaces as ni
import ipaddress
_abs_path = os.path.dirname(os.path.abspath(__file__))
def interfaces_for_ip_netifaces(ip: str):
    """Return the name of the network interface bound to `ip`, or None if absent."""
    wanted = ipaddress.ip_address(ip)
    for name in ni.interfaces():
        # Only IPv4 addresses are considered, matching the trio format used below.
        for link in ni.ifaddresses(name).get(ni.AF_INET, []):
            addr_str = link.get("addr")
            if addr_str is not None and ipaddress.ip_address(addr_str) == wanted:
                return name
    return None
class CustomizedComm:
    """Thin wrapper around mscclpp.CommGroup exposing a custom native all-gather."""

    def __init__(self, comm: mscclpp.CommGroup):
        self.comm = comm
        self.rank = comm.my_rank
        self.world_size = comm.nranks
        self.n_ranks_per_node = comm.nranks_per_node
        self.local_rank = self.rank % self.n_ranks_per_node
        self.executor = mscclpp.Executor(comm.communicator)
        # JIT-compile the native extension next to this script and load the
        # allgather algorithm it exports as a PyCapsule.
        native_module = mscclpp.compile_native(
            name="mscclpp_native", file=os.path.join(_abs_path, "customized_allgather.cu")
        )
        self.algorithm = mscclpp.Algorithm.create_from_native_capsule(native_module.create_allgather_algorithm())

    def all_gather(self, tensor: torch.Tensor, out_tensor: torch.Tensor, stream: torch.cuda.Stream = None):
        """Gather `tensor` from every rank into `out_tensor` on the given stream."""
        cuda_stream = stream.cuda_stream if stream is not None else 0
        self.algorithm.execute(
            self.comm.communicator,
            tensor.data_ptr(),
            out_tensor.data_ptr(),
            tensor.nbytes,
            out_tensor.nbytes,
            mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
            stream=cuda_stream,
        )

    def barrier_cpu(self):
        """Host-side barrier across all ranks."""
        self.comm.barrier()
def main():
    """Single-node all-gather demo driven by torchrun environment variables."""
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ.get("LOCAL_RANK", os.environ["RANK"]))
    torch.cuda.set_device(local_rank)

    # Bootstrap the comm group from the interface that owns MSCCLPP_MASTER_ADDR.
    master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
    master_port = os.environ["MSCCLPP_MASTER_PORT"]
    interface = interfaces_for_ip_netifaces(master_addr)
    if interface is None:
        raise ValueError(f"Cannot find network interface for IP address {master_addr}")
    interfaceIpPortTrio = f"{interface}:{master_addr}:{master_port}"
    mscclpp_group = mscclpp.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world_size)

    # In-place layout: each rank's input is its own slice of the output tensor.
    shard_elems = 1 << 20
    out_tensor = torch.randn(shard_elems * world_size, device="cuda", dtype=torch.float32)
    tensor = out_tensor[rank * shard_elems : (rank + 1) * shard_elems]

    comm = CustomizedComm(mscclpp_group)
    comm.barrier_cpu()
    comm.all_gather(tensor, out_tensor, stream=torch.cuda.current_stream())
    torch.cuda.synchronize()
    comm = None
    print(f"Rank {rank} allgather completed successfully.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,120 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_default_algo.py
import os
import torch
import mscclpp.utils as mscclpp_utils
import mscclpp
import mscclpp.ext
import netifaces as ni
import ipaddress
def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection:
    """Build the default mscclpp algorithm collection backed by `scratch_buffer`."""
    builder = mscclpp.ext.AlgorithmCollectionBuilder()
    return builder.build_default_algorithms(
        scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank
    )
def interfaces_for_ip_netifaces(ip: str):
    """Find the local network interface whose IPv4 address equals `ip`.

    Returns the interface name, or None when no interface matches.
    """
    target = ipaddress.ip_address(ip)
    for interface in ni.interfaces():
        ipv4_links = ni.ifaddresses(interface).get(ni.AF_INET)
        if not ipv4_links:
            continue
        for link in ipv4_links:
            if "addr" in link and ipaddress.ip_address(link["addr"]) == target:
                return interface
    return None
def to_mscclpp_reduce_op(op: torch.distributed.ReduceOp) -> mscclpp.ReduceOp:
    """Translate a torch.distributed reduce op to its mscclpp equivalent.

    Raises ValueError for ops other than SUM and MIN.
    """
    if op == torch.distributed.ReduceOp.SUM:
        return mscclpp.ReduceOp.SUM
    if op == torch.distributed.ReduceOp.MIN:
        return mscclpp.ReduceOp.MIN
    raise ValueError(f"unsupported op: {op}")
class CustomizedComm:
    """Wrapper exposing the mscclpp default NVLS all-reduce algorithms over torch tensors."""

    @staticmethod
    def _find_allreduce_algo(algorithms, name):
        # Same lookup (and same IndexError-on-missing behavior) as an inline
        # list-comprehension followed by [0].
        return [a for a in algorithms if a.collective == "allreduce" and a.name == name][0]

    def __init__(self, comm: mscclpp.CommGroup):
        self.comm = comm
        self.rank = comm.my_rank
        self.world_size = comm.nranks
        self.local_rank = comm.my_rank % comm.nranks_per_node
        self.n_ranks_per_node = comm.nranks_per_node
        # 128 MiB scratch buffer shared by the default algorithms, surfaced to
        # torch through DLPack.
        dlpack = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
        self.scratch_buffer = torch.utils.dlpack.from_dlpack(dlpack)
        algorithms = load_algorithms(scratch_buffer=self.scratch_buffer, rank=self.rank)
        self._algorithm_nvls_packet = self._find_allreduce_algo(algorithms, "default_allreduce_nvls_packet")
        self._algorithm_nvls_nonzero_copy = self._find_allreduce_algo(algorithms, "default_allreduce_nvls_with_copy")

    def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):
        """In-place all-reduce of `tensor` (SUM only); picks packet vs copy algo by size."""
        assert op == torch.distributed.ReduceOp.SUM
        is_small_message = tensor.nbytes < 1 << 20
        algo = self._algorithm_nvls_packet if is_small_message else self._algorithm_nvls_nonzero_copy
        algo.execute(
            comm=self.comm.communicator,
            input_buffer=tensor.data_ptr(),
            output_buffer=tensor.data_ptr(),
            input_size=tensor.nbytes,
            output_size=tensor.nbytes,
            dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
            op=to_mscclpp_reduce_op(op),
            stream=stream.cuda_stream if stream is not None else 0,
        )

    def barrier(self):
        """GPU-side barrier implemented as a 1-element all-reduce."""
        probe = torch.empty(1, dtype=torch.float, device=torch.device("cuda"))
        self.all_reduce(probe, op=torch.distributed.ReduceOp.SUM, stream=torch.cuda.current_stream())

    def destroy(self):
        """Drop references so GPU resources can be reclaimed deterministically."""
        self._algorithm_nvls_nonzero_copy = None
        self._algorithm_nvls_packet = None
        self.scratch_buffer = None
        self.comm = None
def init_dist() -> CustomizedComm:
    """Create the mscclpp comm group from MSCCLPP_* / torchrun env variables."""
    rank = int(os.environ["RANK"])
    world = int(os.environ["WORLD_SIZE"])
    master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
    master_port = os.environ["MSCCLPP_MASTER_PORT"]
    interface = interfaces_for_ip_netifaces(master_addr)
    if interface is None:
        raise ValueError(f"Cannot find network interface for IP address {master_addr}")
    interfaceIpPortTrio = f"{interface}:{master_addr}:{master_port}"
    group = mscclpp.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world)
    return CustomizedComm(group)
def main():
    """Run one demo all-reduce between GPU barriers and report success."""
    local = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local)
    comm = init_dist()
    comm.barrier()
    payload = torch.randn(1 << 22, dtype=torch.float16, device=torch.device("cuda"))
    comm.all_reduce(payload, op=torch.distributed.ReduceOp.SUM, stream=torch.cuda.current_stream())
    comm.barrier()
    comm.destroy()
    print(f"rank {local} All-reduce operation completed successfully.")


if __name__ == "__main__":
    main()

View File

@@ -1,12 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_comm.py
# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_dsl.py
import os
import torch
import mscclpp.comm as mscclpp_comm
import mscclpp
import mscclpp.language
from mscclpp.language.collectives import AllReduce
from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType
from mscclpp.language.program import CollectiveProgram
@@ -15,13 +15,13 @@ import netifaces as ni
import ipaddress
def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
def allreduce_nvls(spec: mscclpp.language.AlgoSpec) -> CollectiveProgram:
gpu_size = spec.world_size
with CollectiveProgram(
spec.name,
spec.collective,
gpu_size,
instances=8,
instances=spec.instances,
protocol=spec.protocol,
num_threads_per_block=spec.num_threads_per_block,
min_message_size=spec.min_message_size,
@@ -74,14 +74,14 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
return program
def setup_plan(registry: mscclpp.ExecutionPlanRegistry, rank: int, world_size: int):
spec = mscclpp.AlgoSpec(
def setup_plan(rank: int, world_size: int, nranks_per_node: int):
spec = mscclpp.language.AlgoSpec(
name="allreduce_nvls",
collective=AllReduce(8, 1, True),
nranks_per_node=8,
collective=AllReduce(world_size, 1, True),
nranks_per_node=nranks_per_node,
world_size=world_size,
in_place=True,
instances=2,
instances=nranks_per_node,
protocol="Simple",
num_threads_per_block=1024,
min_message_size=1 << 20,
@@ -89,17 +89,10 @@ def setup_plan(registry: mscclpp.ExecutionPlanRegistry, rank: int, world_size: i
tags={"nvls": 1},
)
plan_handle = mscclpp.compile(algo=allreduce_nvls, algo_spec=spec, rank=rank)
registry.register_plan(plan_handle)
def selector(plans, req):
if req.collective != "allreduce":
return None
if req.message_size < 1 << 20:
return None
nvls = [p for p in plans if "nvls" in p.tags]
return nvls[0] if nvls else plans[0]
algorithms = []
algo = mscclpp.compile(algo=allreduce_nvls, algo_spec=spec, rank=rank)
algorithms.append(algo)
return algorithms
def interfaces_for_ip_netifaces(ip: str):
@@ -129,43 +122,37 @@ def dtype_to_mscclpp_dtype(dtype: torch.dtype) -> mscclpp.DataType:
class CustomizedComm:
def __init__(self, comm: mscclpp_comm.CommGroup):
def __init__(self, comm: mscclpp.CommGroup, algorithms=[]):
self.comm = comm
self.rank = comm.my_rank
self.world_size = comm.nranks
self.local_rank = comm.my_rank % comm.nranks_per_node
self.n_ranks_per_node = comm.nranks_per_node
self.registry = mscclpp.ExecutionPlanRegistry()
self.executor = mscclpp.Executor(comm.communicator)
self.algorithms = algorithms
def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):
assert op == torch.distributed.ReduceOp.SUM
plan = self.registry.select(
collective="allreduce",
world_size=self.world_size,
n_ranks_per_node=self.n_ranks_per_node,
send_buffer=tensor.data_ptr(),
recv_buffer=tensor.data_ptr(),
message_size=tensor.numel() * tensor.element_size(),
)
if plan is None:
raise ValueError(
f"No suitable plan found for collective allreduce with message size {tensor.numel() * tensor.element_size()}"
)
self.executor.execute(
self.rank,
tensor.data_ptr(),
tensor.data_ptr(),
tensor.numel() * tensor.element_size(),
tensor.numel() * tensor.element_size(),
dtype_to_mscclpp_dtype(tensor.dtype),
plan.plan,
stream.cuda_stream if stream is not None else 0,
algo: mscclpp.Algorithm = self.algorithms[0]
algo.execute(
comm=self.comm.communicator,
executor=self.executor,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=dtype_to_mscclpp_dtype(tensor.dtype),
stream=stream.cuda_stream if stream is not None else 0,
)
def barrier_cpu(self):
self.comm.barrier()
def destroy(self):
self.algorithms = None
self.executor = None
self.comm = None
def init_dist() -> CustomizedComm:
rank = int(os.environ["RANK"])
@@ -175,12 +162,11 @@ def init_dist() -> CustomizedComm:
interface = interfaces_for_ip_netifaces(master_addr)
if interface is None:
raise ValueError(f"Cannot find network interface for IP address {master_addr}")
registry = mscclpp.ExecutionPlanRegistry()
setup_plan(registry, rank, world)
registry.set_selector(selector)
nranks_per_node = int(torch.cuda.device_count())
algorithms = setup_plan(rank, world, nranks_per_node)
interfaceIpPortTrio = f"{interface}:{master_addr}:{master_port}"
mscclpp_group = mscclpp_comm.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world)
return CustomizedComm(mscclpp_group)
mscclpp_group = mscclpp.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world)
return CustomizedComm(mscclpp_group, algorithms)
def main():
@@ -192,9 +178,11 @@ def main():
dlpack = buffer.to_dlpack(data_type=str(torch.bfloat16))
x = torch.utils.dlpack.from_dlpack(dlpack)
x.normal_()
comm.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
comm.all_reduce(x, op=torch.distributed.ReduceOp.SUM, stream=torch.cuda.current_stream())
torch.cuda.synchronize()
comm.barrier_cpu()
comm = None
print(f"Rank {comm.rank} allreduce completed successfully.")
comm.destroy()
if __name__ == "__main__":

View File

@@ -1,9 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# LD_PRELOAD=<MSCCLPP_REPO>/build/apps/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl-torch-integration/dsl_with_nccl_api.py
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
import os
from typing import Any, Dict
import torch, torch.distributed as dist
import mscclpp
from mscclpp.language.collectives import AllReduce
@@ -62,7 +63,7 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
return program
def setup_plan(registry: mscclpp.ExecutionPlanRegistry, rank: int, world_size: int):
def setup_plan(algo_collection_builder: mscclpp.AlgorithmCollectionBuilder, rank: int, world_size: int):
spec = mscclpp.AlgoSpec(
name="allreduce_nvls",
collective=AllReduce(8, 1, True),
@@ -77,27 +78,26 @@ def setup_plan(registry: mscclpp.ExecutionPlanRegistry, rank: int, world_size: i
tags={"nvls": 1},
)
plan_handle = mscclpp.compile(algo=allreduce_nvls, algo_spec=spec, rank=rank)
registry.register_plan(plan_handle)
algo = mscclpp.compile(algo=allreduce_nvls, algo_spec=spec, rank=rank)
algo_collection_builder.add_algorithm_builder(algo)
def selector(plans, req):
def selector(algorithms: Dict[str, Any], req):
if req.collective != "allreduce":
return None
if req.message_size < 1 << 20:
return None
nvls = [p for p in plans if "nvls" in p.tags]
return nvls[0] if nvls else plans[0]
return algorithms["allreduce"]["allreduce_nvls"]
def init_dist():
rank = int(os.environ["RANK"])
world = int(os.environ["WORLD_SIZE"])
local = int(os.environ["LOCAL_RANK"])
registry = mscclpp.ExecutionPlanRegistry()
setup_plan(registry, rank, world)
registry.set_selector(selector)
dist.init_process_group(backend="nccl")
algorithm_collection_builder = mscclpp.AlgorithmCollectionBuilder()
setup_plan(algorithm_collection_builder, rank, world)
algorithm_collection_builder.set_algorithm_selector(selector)
dist.init_process_group(backend="nccl", device_id=local)
return rank, world, local
@@ -111,6 +111,7 @@ def main():
dist.all_reduce(x, op=dist.ReduceOp.SUM)
dist.barrier()
dist.destroy_process_group()
print(f"Rank {local} allreduce completed successfully.")
if __name__ == "__main__":

View File

@@ -7,3 +7,6 @@ target_sources(
BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}
FILES ${HEADERS}
)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp DESTINATION include)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/version.hpp DESTINATION include/mscclpp)

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#ifndef MSCCLPP_ALGORITHM_HPP_
#define MSCCLPP_ALGORITHM_HPP_
@@ -14,26 +14,117 @@
namespace mscclpp {
class AlgorithmCtx {
public:
int rank;
int workSize;
int nRanksPerNode;
/// Capsule name for native algorithm pointers used in Python bindings.
constexpr char ALGORITHM_NATIVE_CAPSULE_NAME[] = "mscclpp::AlgorithmPtr";
std::vector<mscclpp::RegisteredMemory> registeredMemories;
std::vector<mscclpp::MemoryChannel> memoryChannels;
std::vector<mscclpp::SwitchChannel> switchChannels;
std::vector<mscclpp::PortChannel> portChannels;
std::vector<std::shared_ptr<mscclpp::NvlsConnection>> nvlsConnections;
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> memoryChannelDeviceHandles;
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SwitchChannel>> switchChannelDeviceHandles;
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::PortChannel>> portChannelDeviceHandles;
std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores;
std::vector<std::shared_ptr<mscclpp::Host2DeviceSemaphore>> hostSemaphores;
std::unordered_map<std::string, std::shared_ptr<void>> extras;
/// Buffer placement modes a collective algorithm can declare support for.
enum class CollectiveBufferMode {
  Any = 0,     // algorithm accepts both in-place and out-of-place buffers
  InPlace,
  OutOfPlace,
};

/// How an algorithm implementation is provided.
enum class AlgorithmType {
  Native = 0,  // hand-written C++/CUDA implementation
  DSL,         // defined via the MSCCL++ DSL
};

/// Result codes returned by collective operations.
/// NOTE(review): values appear to mirror ncclResult_t ordering — confirm
/// before reordering or inserting entries.
enum class CommResult {
  CommSuccess = 0,
  CommUnhandledCudaError = 1,
  CommSystemError = 2,
  CommInternalError = 3,
  CommInvalidArgument = 4,
  CommInvalidUsage = 5,
  CommRemoteError = 6,
  CommInProgress = 7,
  CommNumResults = 8
};

/// Reduction operations for reduce-type collectives.
/// NOTE(review): SUM/MIN values presumably track ncclRedOp_t; NOP = 255 looks
/// like a "no reduction" sentinel (e.g. for allgather) — verify against callers.
enum ReduceOp { SUM = 0, MIN = 3, NOP = 255 };
/// Base class for collective communication algorithms.
///
/// This abstract class defines the interface for implementing collective communication
/// algorithms such as allreduce, allgather, and reduce-scatter. Concrete implementations
/// can be either native C++/CUDA algorithms or DSL-defined algorithms.
class Algorithm {
public:
struct Constraint {
int worldSize;
int nRanksPerNode;
};
virtual ~Algorithm() = default;
/// Get the name of the algorithm.
/// @return A reference to the algorithm name string.
virtual const std::string& name() const = 0;
/// Get the collective operation this algorithm implements.
/// @return A reference to the collective name (e.g., "allreduce", "allgather").
virtual const std::string& collective() const = 0;
/// Get the valid message size range for this algorithm.
/// @return A pair of (minMessageSize, maxMessageSize) in bytes.
virtual const std::pair<size_t, size_t>& messageRange() const = 0;
/// Get the tags associated with this algorithm.
/// @return An unordered map of tag names to tag values.
virtual const std::unordered_map<std::string, uint64_t>& tags() const = 0;
/// Get the buffer mode supported by this algorithm.
/// @return The CollectiveBufferMode indicating in-place, out-of-place, or any.
virtual const CollectiveBufferMode& bufferMode() const = 0;
/// Get the type of this algorithm.
/// @return AlgorithmType::Native or AlgorithmType::DSL.
virtual AlgorithmType type() const = 0;
/// Get the execution constraints for this algorithm.
/// @return The Constraint struct specifying worldSize and nRanksPerNode requirements.
virtual Constraint constraint() const = 0;
/// Execute the algorithm.
/// @param comm The communicator to use.
/// @param input Pointer to the input buffer.
/// @param output Pointer to the output buffer.
/// @param inputSize Size of the input buffer in bytes.
/// @param outputSize Size of the output buffer in bytes.
/// @param dtype The data type of the elements.
/// @param op The reduction operation (for reduce-type collectives).
/// @param stream The CUDA stream to execute on.
/// @param executor The executor for DSL algorithms (may be nullptr for native).
/// @param nBlocks Number of CUDA blocks (0 for auto-selection).
/// @param nThreadsPerBlock Number of threads per block (0 for auto-selection).
/// @param extras Additional parameters for algorithm-specific customization.
/// @return The result of the operation.
virtual CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
const std::unordered_map<std::string, uintptr_t>& extras = {}) = 0;
/// Reset the algorithm state, clearing any cached contexts.
virtual void reset() = 0;
};
/// Interface for building Algorithm instances.
///
/// Implement this interface to create custom algorithm factories that can be
/// registered with the AlgorithmCollectionBuilder.
class AlgorithmBuilder {
public:
virtual ~AlgorithmBuilder() = default;
/// Build and return an Algorithm instance.
/// @return A shared pointer to the constructed Algorithm.
virtual std::shared_ptr<Algorithm> build() = 0;
};
/// Key for identifying cached AlgorithmCtx instances.
///
/// The context key uniquely identifies a buffer configuration, allowing
/// the algorithm to cache and reuse contexts for repeated operations with
/// the same buffers.
struct AlgorithmCtxKey {
void* baseSendBuff;
void* baseRecvBuff;
@@ -47,42 +138,6 @@ struct AlgorithmCtxKey {
}
};
class AlgorithmImpl;
/// A collective algorithm composed of user-supplied callbacks for
/// initialization, context creation/caching, and kernel launch.
/// The class itself is a thin pImpl handle and is cheap to copy.
class Algorithm {
 public:
  /// One-time initialization callback; receives the communicator and a
  /// mutable map of extra, algorithm-specific shared state.
  using InitFunc = std::function<void(std::shared_ptr<mscclpp::Communicator>,
                                      std::unordered_map<std::string, std::shared_ptr<void>>&)>;
  /// Kernel-launch callback: (ctx, input, output, count, dtype, stream, extras) -> status code.
  using KernelFunc = std::function<int(const std::shared_ptr<AlgorithmCtx>, const void*, void*, size_t, DataType,
                                       cudaStream_t, std::unordered_map<std::string, std::shared_ptr<void>>&)>;
  /// Callback that builds an AlgorithmCtx for a given buffer configuration.
  using ContextInitFunc = std::function<std::shared_ptr<AlgorithmCtx>(std::shared_ptr<mscclpp::Communicator>,
                                                                      const void*, void*, size_t, DataType)>;
  /// Callback that maps a buffer configuration to a cache key (see AlgorithmCtxKey).
  using ContextKeyGenFunc =
      std::function<AlgorithmCtxKey(const void* input, void* output, size_t count, DataType dtype)>;

  /// Construct an algorithm from its name, target collective, and callbacks.
  Algorithm(std::string name, std::string collective, InitFunc initFunc, KernelFunc kernelFunc,
            ContextInitFunc contextInitFunc, ContextKeyGenFunc contextKeyGenFunc);
  /// Default-constructed Algorithm is "empty"; check with isEmpty() before use.
  Algorithm() = default;

  /// @brief Launch the algorithm.
  /// @param comm The communicator.
  /// @param input The input buffer.
  /// @param output The output buffer.
  /// @param count The number of elements.
  /// @param dtype The data type.
  /// @param stream The CUDA stream.
  /// @param extras Extra, algorithm-specific shared state passed through to the kernel.
  /// @details This method will call ContextKeyGenFunc to generate a context key based on the input parameters,
  /// and then use the context key to retrieve or create an AlgorithmCtx. The kernel function
  /// will be launched with the AlgorithmCtx.
  int launch(std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count, DataType dtype,
             cudaStream_t stream, std::unordered_map<std::string, std::shared_ptr<void>>& extras);
  /// @return True if this handle was default-constructed (has no implementation).
  bool isEmpty();
  /// @return Human-readable algorithm name.
  std::string name() const;
  /// @return Name of the collective this algorithm implements (e.g., "allreduce").
  std::string collective() const;

 private:
  class Impl;
  std::shared_ptr<Impl> impl_;
};
} // namespace mscclpp
namespace std {
@@ -103,72 +158,202 @@ struct hash<mscclpp::AlgorithmCtxKey> {
namespace mscclpp {
class AlgorithmBuilder {
/// Native C++/CUDA implementation of a collective algorithm.
///
/// NativeAlgorithm allows users to implement custom collective algorithms in C++/CUDA.
/// It provides a framework for initialization, context management, and kernel execution.
/// Contexts are cached based on buffer configurations to avoid redundant setup.
class NativeAlgorithm : public Algorithm {
public:
virtual ~AlgorithmBuilder() = default;
virtual Algorithm build() = 0;
using InitFunc = std::function<void(std::shared_ptr<Communicator>)>;
/// Function type for the kernel that executes the collective operation.
/// @param ctx The algorithm context containing channels and semaphores.
/// @param input Pointer to the input buffer.
/// @param output Pointer to the output buffer.
/// @param inputSize Size of the input buffer in bytes.
/// @param outputSize Size of the output buffer in bytes.
/// @param dtype Data type of the elements.
/// @param op Reduction operation (for reduce-type collectives).
/// @param stream CUDA stream to execute on.
/// @param nBlocks Number of CUDA blocks.
/// @param nThreadsPerBlock Number of threads per block.
/// @param extras Additional algorithm-specific parameters.
/// @return The result of the operation.
using KernelFunc =
std::function<CommResult(const std::shared_ptr<void>, const void*, void*, size_t, size_t, DataType, ReduceOp,
cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&)>;
/// Function type for creating algorithm contexts.
/// @param comm The communicator.
/// @param input Pointer to the input buffer.
/// @param output Pointer to the output buffer.
/// @param inputSize Size of the input buffer.
/// @param outputSize Size of the output buffer.
/// @param dtype Data type of the elements.
/// @return A shared pointer to the created context.
using ContextInitFunc =
std::function<std::shared_ptr<void>(std::shared_ptr<Communicator>, const void*, void*, size_t, size_t, DataType)>;
/// Function type for generating context keys.
/// @param input Pointer to the input buffer.
/// @param output Pointer to the output buffer.
/// @param inputSize Size of the input buffer.
/// @param outputSize Size of the output buffer.
/// @param dtype Data type of the elements.
/// @return A key uniquely identifying this buffer configuration.
using ContextKeyGenFunc = std::function<AlgorithmCtxKey(const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype)>;
/// Construct a NativeAlgorithm.
/// @param name Human-readable name of the algorithm.
/// @param collective The collective operation (e.g., "allreduce").
/// @param initFunc Function called once to initialize the algorithm.
/// @param kernelFunc Function that launches the CUDA kernel.
/// @param contextInitFunc Function that creates execution contexts.
/// @param contextKeyGenFunc Function that generates cache keys for contexts.
/// @param minMessageSize Minimum supported message size in bytes (default: 0).
/// @param maxMessageSize Maximum supported message size in bytes (default: UINT64_MAX).
/// @param bufferMode Buffer mode supported by this algorithm (default: ANY).
/// @param tags Tags for algorithm selection hints.
/// @param constraint Execution constraints (worldSize, nRanksPerNode).
NativeAlgorithm(std::string name, std::string collective, InitFunc initFunc, KernelFunc kernelFunc,
ContextInitFunc contextInitFunc, ContextKeyGenFunc contextKeyGenFunc, size_t minMessageSize = 0,
size_t maxMessageSize = UINT64_MAX, CollectiveBufferMode bufferMode = CollectiveBufferMode::Any,
std::unordered_map<std::string, uint64_t> tags = {}, Constraint constraint = {});
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
const std::string& name() const override;
const std::string& collective() const override;
const std::pair<size_t, size_t>& messageRange() const override;
const std::unordered_map<std::string, uint64_t>& tags() const override;
const CollectiveBufferMode& bufferMode() const override;
AlgorithmType type() const override { return AlgorithmType::Native; }
Constraint constraint() const override;
void reset() override;
private:
std::string name_;
std::string collective_;
NativeAlgorithm::InitFunc initFunc_;
NativeAlgorithm::KernelFunc kernelLaunchFunc_;
NativeAlgorithm::ContextInitFunc contextInitFunc_;
NativeAlgorithm::ContextKeyGenFunc contextKeyGenFunc_;
size_t minMessageSize_;
size_t maxMessageSize_;
CollectiveBufferMode bufferMode_;
std::unordered_map<std::string, uint64_t> tags_;
Constraint constraint_;
std::unordered_map<AlgorithmCtxKey, std::shared_ptr<void>> contexts_;
bool initialized_ = false;
};
using AlgoSelectFunc = std::function<Algorithm(
const std::unordered_map<std::string, std::unordered_map<std::string, Algorithm>>& algoMapByCollective,
std::string collective, const void* input, void* output, size_t messageSize, DataType dtype, int nRanksPerNode,
int worldSize)>;
/// DSL-based implementation of a collective algorithm.
///
/// DslAlgorithm wraps an ExecutionPlan loaded from a DSL specification file.
/// It implements both Algorithm and AlgorithmBuilder interfaces, allowing it
/// to be used directly or registered with AlgorithmCollectionBuilder.
class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enable_shared_from_this<DslAlgorithm> {
 public:
  /// Construct a DslAlgorithm from an execution plan.
  /// @param id Identifier for this algorithm instance.
  /// @param plan The execution plan defining the algorithm.
  /// @param tags Tags for algorithm selection hints.
  /// @param constraint Execution constraints (worldSize, nRanksPerNode).
  DslAlgorithm(std::string id, ExecutionPlan plan, std::unordered_map<std::string, uint64_t> tags = {},
               Constraint constraint = {});
  /// @return The algorithm name.
  const std::string& name() const override;
  /// @return The collective this algorithm implements (e.g., "allreduce").
  const std::string& collective() const override;
  /// @return Supported [min, max] message size range in bytes.
  const std::pair<size_t, size_t>& messageRange() const override;
  /// @return Tags used as selection hints.
  const std::unordered_map<std::string, uint64_t>& tags() const override;
  /// @return Buffer mode (in-place / out-of-place / any) supported by the plan.
  const CollectiveBufferMode& bufferMode() const override;
  /// Execute the wrapped execution plan on the given buffers and stream.
  /// @param executor Executor used to run the plan.
  /// @param nBlocks / nThreadsPerBlock Launch-shape overrides; 0 means use defaults.
  /// @param extras Additional algorithm-specific parameters.
  CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
                     size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
                     std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
                     const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
  /// @return AlgorithmType::DSL.
  AlgorithmType type() const override { return AlgorithmType::DSL; }
  /// @return Execution constraints (worldSize, nRanksPerNode).
  Constraint constraint() const override;
  /// Reset internal state. NOTE(review): exact scope of the reset is defined
  /// by the implementation; not visible from this declaration.
  void reset() override;
  /// AlgorithmBuilder hook. Presumably returns this instance via
  /// shared_from_this() (hence the enable_shared_from_this base) — confirm in
  /// the implementation.
  std::shared_ptr<Algorithm> build() override;

 private:
  ExecutionPlan plan_;
  std::string id_;
  std::unordered_map<std::string, uint64_t> tags_;
  Constraint constraint_;
};
/// Request parameters for selecting and executing a collective operation.
///
/// This struct encapsulates all the information needed to select an appropriate
/// algorithm for a collective operation.
///
/// NOTE(review): `collective` and `hints` are reference members; a
/// CollectiveRequest must not outlive the objects those references were bound
/// to, and the struct is neither copy- nor default-assignable as a result.
struct CollectiveRequest {
  int worldSize;            // Total number of ranks in the communicator.
  int nRanksPerNode;        // Ranks co-located on each node.
  int rank;                 // Rank of the caller.
  const void* inputBuffer;  // Source buffer of the collective.
  void* outputBuffer;       // Destination buffer of the collective.
  size_t messageSize;       // Message size in bytes.
  const std::string& collective;  // Collective name (e.g., "allreduce").
  const DataType dtype;           // Element data type.
  const std::unordered_map<std::string, std::vector<uint64_t>>& hints;  // Optional selection hints.

  /// Buffer mode implied by this request — presumably derived from whether
  /// inputBuffer and outputBuffer alias (in-place) or not; confirm in the
  /// implementation.
  CollectiveBufferMode bufferMode() const;
};
/// Function type for custom algorithm selection.
/// @param algoMapByCollective Map of collective names to available algorithms.
/// @param request The collective request parameters.
/// @return The selected algorithm, or nullptr if no suitable algorithm is found.
using AlgoSelectFunc = std::function<std::shared_ptr<Algorithm>(
const std::unordered_map<std::string, std::unordered_map<std::string, std::shared_ptr<Algorithm>>>&
algoMapByCollective,
const CollectiveRequest& request)>;
/// Collection of algorithms for collective operations.
///
/// AlgorithmCollection manages a set of algorithms indexed by collective operation
/// name and algorithm name. It provides methods to select the best algorithm for
/// a given request and to register new algorithms.
class AlgorithmCollection {
public:
AlgorithmCollection() = default;
/// @brief Select an algorithm based on the collective operation name and message size.
/// @param collective The collective operation name.
/// @param input The input buffer.
/// @param output The output buffer.
/// @param messageSize The message size.
/// @param dtype The data type.
/// @param nRanksPerNode The number of ranks per node.
/// @param worldSize The total number of ranks.
/// @return The selected algorithm. If no suitable algorithm is found, an empty Algorithm object is returned.
Algorithm selectAlgorithm(const std::string& collective, const void* input, void* output, size_t messageSize,
DataType dtype, int nRanksPerNode, int worldSize);
/// Select an algorithm based on the collective operation name and message size.
/// @param request The collective request containing all necessary parameters.
/// @return The selected algorithm. If no suitable algorithm is found, nullptr is returned.
std::shared_ptr<Algorithm> selectAlgorithm(const CollectiveRequest& request);
/// @brief Register a new algorithm.
/// @param collective The collective operation name.
/// Register a new algorithm.
/// @param collective The collective operation name (e.g., "allreduce").
/// @param algoName The algorithm name.
/// @param algorithm The algorithm implementation.
void registerAlgorithm(const std::string collective, const std::string algoName, Algorithm algorithm);
void registerAlgorithm(const std::string collective, const std::string algoName,
std::shared_ptr<Algorithm> algorithm);
/// Get all algorithms for a specific collective operation.
/// @param collective The collective operation name.
/// @return A map of algorithm names to algorithm instances.
std::unordered_map<std::string, std::shared_ptr<Algorithm>> getAlgorithmsByCollective(
const std::string& collective) const;
/// Get all registered algorithms.
/// @return A vector containing all algorithm instances.
std::vector<std::shared_ptr<Algorithm>> getAllAlgorithms() const;
/// Extend this collection with algorithms from another collection.
/// @param other The other AlgorithmCollection to merge in.
void extend(const AlgorithmCollection& other);
void setSelectors(AlgoSelectFunc algoSelector, AlgoSelectFunc fallbackAlgoSelector);
private:
std::unordered_map<std::string, std::unordered_map<std::string, Algorithm>> algoMapByCollective_;
AlgoSelectFunc algoSelector_ = nullptr;
AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
friend class AlgorithmCollectionBuilder;
};
/// Singleton builder that aggregates AlgorithmBuilder factories and selector
/// functions, and assembles them into an AlgorithmCollection.
class AlgorithmCollectionBuilder {
 public:
  /// @return The process-wide singleton instance.
  static std::shared_ptr<AlgorithmCollectionBuilder> getInstance();
  /// @brief Add a new algorithm builder for a specific collective operation.
  /// @param builder The algorithm builder.
  void addAlgorithmBuilder(std::shared_ptr<AlgorithmBuilder> builder);
  /// @brief Set a new algorithm selection function.
  /// @param selector The algorithm selection function.
  void setAlgorithmSelector(AlgoSelectFunc selector);
  /// @brief Set a fallback algorithm selection function.
  /// @param selector The fallback algorithm selection function.
  /// @details The fallback selector will be used if the primary selector returns an empty algorithm. MSCCL++ will
  /// assign a predefined selector as the fallback selector.
  void setFallbackAlgorithmSelector(AlgoSelectFunc selector);
  /// @brief Build the AlgorithmCollection instance.
  /// @return The AlgorithmCollection instance.
  std::shared_ptr<AlgorithmCollection> build();

 private:
  // Private: instances are obtained via getInstance() only.
  AlgorithmCollectionBuilder() = default;
  std::vector<std::shared_ptr<AlgorithmBuilder>> algoBuilders_;
  std::unordered_map<std::string, std::unordered_map<std::string, std::shared_ptr<Algorithm>>> algoMapByCollective_;
  AlgoSelectFunc algoSelector_ = nullptr;
  AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
};

View File

@@ -381,12 +381,12 @@ struct EndpointConfig {
/// These settings are only used when the transport is an InfiniBand type (IB0-IB7); they are ignored for other
/// transports.
struct Ib {
static const int DefaultPort = -1;
static const int DefaultGidIndex = 0;
static const int DefaultMaxCqSize = 1024;
static const int DefaultMaxCqPollNum = 1;
static const int DefaultMaxSendWr = 8192;
static const int DefaultMaxWrPerSend = 64;
static constexpr int DefaultPort = -1;
static constexpr int DefaultGidIndex = 0;
static constexpr int DefaultMaxCqSize = 1024;
static constexpr int DefaultMaxCqPollNum = 1;
static constexpr int DefaultMaxSendWr = 8192;
static constexpr int DefaultMaxWrPerSend = 64;
/// Device index. Currently ignored; use transport type (IB0-IB7) to select device.
int deviceIndex;

View File

@@ -33,10 +33,10 @@ class ExecutionPlan {
~ExecutionPlan() = default;
/// Return the human-readable name of the plan.
std::string name() const;
const std::string& name() const;
/// Return the collective implemented by this plan (e.g., "allreduce", "allgather").
std::string collective() const;
const std::string& collective() const;
/// Minimum message size (in bytes) for which this plan is valid.
size_t minMessageSize() const;
@@ -54,95 +54,6 @@ class ExecutionPlan {
friend class Executor;
};
/// Request parameters provided when executing a plan.
///
/// NOTE(review): `collective` and `hints` are reference members; the request
/// must not outlive the objects those references were bound to.
struct ExecutionRequest {
  int worldSize;            // Total number of ranks.
  int nRanksPerNode;        // Ranks co-located on each node.
  int rank;                 // Rank of the caller.
  const void* inputBuffer;  // Source buffer.
  void* outputBuffer;       // Destination buffer.
  size_t messageSize;       // Message size in bytes.
  const std::string& collective;  // Collective name (e.g., "allreduce").
  const std::unordered_map<std::string, std::vector<uint64_t>>& hints;  // Optional selection hints.

  /// Whether the request indicates an in-place operation.
  bool isInPlace() const;
};
/// A handle representing a specific execution plan along with its constraints and metadata.
struct ExecutionPlanHandle {
  /// Constraints that must be satisfied for the plan to be valid.
  struct Constraint {
    int worldSize;      // Required total number of ranks.
    int nRanksPerNode;  // Required ranks per node.
  };
  std::string id;                                  /// Unique identifier for the handle.
  Constraint constraint;                           /// Constraints for plan applicability.
  std::shared_ptr<ExecutionPlan> plan;             /// Backing ExecutionPlan instance.
  std::unordered_map<std::string, uint64_t> tags;  /// Optional tags/metadata used by selector.

  /// Create a new ExecutionPlanHandle.
  /// @param id Unique id for the handle.
  /// @param worldSize Required world size for the plan.
  /// @param nRanksPerNode Required ranks-per-node for the plan.
  /// @param plan The associated ExecutionPlan.
  /// @param tags Optional tags used for selection.
  static std::shared_ptr<ExecutionPlanHandle> create(const std::string& id, int worldSize, int nRanksPerNode,
                                                     std::shared_ptr<ExecutionPlan> plan,
                                                     const std::unordered_map<std::string, uint64_t>& tags = {});

  /// Check whether the given ExecutionRequest satisfies this handle's parameters.
  /// @param request The execution request to evaluate.
  /// @return True if the request matches the handle parameters, false otherwise.
  bool match(const ExecutionRequest& request);
};
/// Selector function type used to pick an ExecutionPlanHandle from a list of candidates.
using ExecutionPlanSelector = std::function<std::shared_ptr<ExecutionPlanHandle>(
const std::vector<std::shared_ptr<ExecutionPlanHandle>> plans, const ExecutionRequest& request)>;
/// Registry that holds available execution plans and performs selection logic.
///
/// Accessed as a process-wide singleton via getInstance(). Implementation
/// details live behind a pImpl.
class ExecutionPlanRegistry {
 public:
  /// Retrieve the singleton instance of the registry.
  static std::shared_ptr<ExecutionPlanRegistry> getInstance();
  /// Destructor.
  ~ExecutionPlanRegistry();
  /// Register a plan handle with the registry.
  void registerPlan(const std::shared_ptr<ExecutionPlanHandle> planHandle);
  /// Get all plan handles for a given collective name.
  std::vector<std::shared_ptr<ExecutionPlanHandle>> getPlans(const std::string& collective);
  /// Lookup a plan handle by id.
  std::shared_ptr<ExecutionPlanHandle> get(const std::string& id);
  /// Select a suitable plan handle for the given parameters.
  /// Uses the custom selector when set, otherwise the default selector.
  std::shared_ptr<ExecutionPlanHandle> select(const std::string& collective, int worldSize, int nRanksPerNode, int rank,
                                              const void* sendBuffer, void* recvBuffer, size_t messageSize,
                                              const std::unordered_map<std::string, std::vector<uint64_t>>& hints);
  /// Provide a custom selector function.
  void setSelector(ExecutionPlanSelector selector);
  /// Set the default selector used when no custom selector is provided.
  void setDefaultSelector(ExecutionPlanSelector selector);
  /// Load built-in/default plans for the given rank.
  void loadDefaultPlans(int rank);
  /// Clear all registered plans from the registry.
  void clear();

 private:
  struct Impl;
  std::unique_ptr<Impl> impl_;
  // Private: instances are obtained via getInstance() only.
  ExecutionPlanRegistry();
};
/// High-level executor responsible for invoking execution plans on a communicator.
class Executor {
public:

View File

@@ -0,0 +1,67 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#ifndef MSCCLPP_EXT_COLLECTIVES_ALGORITHM_COLLECTION_BUILDER_HPP_
#define MSCCLPP_EXT_COLLECTIVES_ALGORITHM_COLLECTION_BUILDER_HPP_
#include <mscclpp/algorithm.hpp>
namespace mscclpp {
namespace collective {
/// Builder for creating AlgorithmCollection instances.
///
/// AlgorithmCollectionBuilder provides a singleton interface for registering
/// algorithm builders and configuring algorithm selection functions. It can
/// build both default algorithms and custom algorithms registered by users.
///
/// Typical usage:
/// 1. Get the singleton instance with getInstance()
/// 2. Add algorithm builders with addAlgorithmBuilder()
/// 3. Optionally set custom selectors with setAlgorithmSelector()
/// 4. Build the collection with build() or buildDefaultAlgorithms()
class AlgorithmCollectionBuilder {
 public:
  /// Get the singleton instance of the builder.
  /// @return A shared pointer to the singleton instance.
  static std::shared_ptr<AlgorithmCollectionBuilder> getInstance();
  /// Reset the singleton instance.
  static void reset();
  /// Add a new algorithm builder.
  /// @param builder The algorithm builder to add.
  void addAlgorithmBuilder(std::shared_ptr<AlgorithmBuilder> builder);
  /// Set a custom algorithm selection function.
  /// @param selector The algorithm selection function.
  void setAlgorithmSelector(AlgoSelectFunc selector);
  /// Set a fallback algorithm selection function.
  /// @param selector The fallback algorithm selection function.
  /// @note The fallback selector is used if the primary selector returns nullptr.
  ///       MSCCL++ assigns a predefined selector as the fallback by default.
  void setFallbackAlgorithmSelector(AlgoSelectFunc selector);
  /// Build the AlgorithmCollection instance.
  /// @return The built AlgorithmCollection containing all registered algorithms.
  AlgorithmCollection build();
  /// Build a collection containing MSCCL++'s built-in algorithms (native and DSL).
  /// @param scratchBuffer Scratch buffer address passed to the native algorithms
  ///        (as uintptr_t; presumably a device pointer — confirm with callers).
  /// @param scratchBufferSize Size of the scratch buffer in bytes.
  /// @param rank Rank used when loading default DSL plans.
  /// @return The built AlgorithmCollection.
  AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, int rank);

 private:
  // Private: instances are obtained via getInstance() only.
  AlgorithmCollectionBuilder() = default;
  std::vector<std::shared_ptr<AlgorithmBuilder>> algoBuilders_;
  AlgoSelectFunc algoSelector_ = nullptr;
  AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
  // Helpers for buildDefaultAlgorithms(): native C++/CUDA algorithms and
  // DSL-plan-based algorithms respectively.
  AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize);
  AlgorithmCollection buildDefaultDslAlgorithms(int rank);
  static std::shared_ptr<AlgorithmCollectionBuilder> gAlgorithmCollectionBuilder_;
};
} // namespace collective
} // namespace mscclpp
#endif // MSCCLPP_EXT_COLLECTIVES_ALGORITHM_COLLECTION_BUILDER_HPP_

View File

@@ -6,10 +6,10 @@ add_subdirectory(test)
add_custom_target(pytest_lib_copy ALL
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_BINARY_DIR}/csrc/_mscclpp.*.so
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_mscclpp.*.so
${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_ext.*.so
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
DEPENDS mscclpp_py mscclpp_py_test
)

View File

@@ -3,7 +3,7 @@
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
include(FetchContent)
FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0)
FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.9.2)
FetchContent_MakeAvailable(nanobind)
FetchContent_Declare(dlpack
@@ -21,6 +21,7 @@ endif()
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp)
nanobind_add_module(mscclpp_py ${SOURCES})
set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp)
target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp_static ${GPU_LIBRARIES})
set_target_properties(mscclpp_py PROPERTIES INSTALL_RPATH "\$ORIGIN/lib")
target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp mscclpp_collectives ${GPU_LIBRARIES})
target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
install(TARGETS mscclpp_py LIBRARY DESTINATION .)

113
python/csrc/algorithm.cpp Normal file
View File

@@ -0,0 +1,113 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <nanobind/nanobind.h>
#include <nanobind/stl/function.h>
#include <nanobind/stl/pair.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/vector.h>
#include <cstring>
#include <mscclpp/algorithm.hpp>
namespace nb = nanobind;
using namespace mscclpp;
/// Register the algorithm-related enums and classes with the Python module.
///
/// NOTE: registration order matters in nanobind — enums used as argument
/// defaults (e.g. ReduceOp::NOP) and base classes (Algorithm) must be bound
/// before the bindings that reference them.
void register_algorithm(nb::module_& m) {
  // Buffer placement modes an algorithm can support.
  nb::enum_<CollectiveBufferMode>(m, "CollectiveBufferMode")
      .value("ANY", CollectiveBufferMode::Any)
      .value("IN_PLACE", CollectiveBufferMode::InPlace)
      .value("OUT_OF_PLACE", CollectiveBufferMode::OutOfPlace);

  nb::enum_<AlgorithmType>(m, "AlgorithmType").value("NATIVE", AlgorithmType::Native).value("DSL", AlgorithmType::DSL);

  // Status codes returned by Algorithm.execute().
  nb::enum_<CommResult>(m, "CommResult")
      .value("COMM_SUCCESS", CommResult::CommSuccess)
      .value("COMM_UNHANDLED_CUDA_ERROR", CommResult::CommUnhandledCudaError)
      .value("COMM_SYSTEM_ERROR", CommResult::CommSystemError)
      .value("COMM_INTERNAL_ERROR", CommResult::CommInternalError)
      .value("COMM_INVALID_ARGUMENT", CommResult::CommInvalidArgument)
      .value("COMM_INVALID_USAGE", CommResult::CommInvalidUsage)
      .value("COMM_REMOTE_ERROR", CommResult::CommRemoteError)
      .value("COMM_IN_PROGRESS", CommResult::CommInProgress)
      .value("COMM_NUM_RESULTS", CommResult::CommNumResults);

  // Must be bound before Algorithm.execute(), which defaults op=ReduceOp::NOP.
  nb::enum_<ReduceOp>(m, "ReduceOp")
      .value("SUM", ReduceOp::SUM)
      .value("MIN", ReduceOp::MIN)
      .value("NOP", ReduceOp::NOP);

  auto algorithmClass =
      nb::class_<Algorithm>(m, "Algorithm")
          // Unwrap a shared_ptr<Algorithm> handed across the C API as a
          // PyCapsule; validates both the capsule name and the payload.
          .def_static(
              "from_native_capsule",
              [](nb::capsule cap) {
                const char* name = cap.name();
                if (name == nullptr || std::strcmp(name, ALGORITHM_NATIVE_CAPSULE_NAME) != 0) {
                  throw nb::type_error("Invalid capsule: expected 'mscclpp::AlgorithmPtr'");
                }
                void* data = cap.data();
                if (data == nullptr) {
                  throw nb::value_error("Failed to get pointer from capsule");
                }
                // Copy the shared_ptr out of the capsule (shares ownership).
                return *static_cast<std::shared_ptr<Algorithm>*>(data);
              },
              nb::arg("capsule"))
          .def_prop_ro("name", &Algorithm::name)
          .def_prop_ro("collective", &Algorithm::collective)
          .def_prop_ro("message_range", &Algorithm::messageRange)
          .def_prop_ro("tags", &Algorithm::tags)
          .def_prop_ro("buffer_mode", &Algorithm::bufferMode)
          .def_prop_ro("constraint", &Algorithm::constraint)
          .def_prop_ro("type", &Algorithm::type)
          // Buffers and the stream are passed from Python as integer
          // addresses (uintptr_t) and reinterpreted as raw pointers here.
          .def(
              "execute",
              [](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
                 size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
                 std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock,
                 std::unordered_map<std::string, uintptr_t> extras) {
                return self.execute(comm, reinterpret_cast<const void*>(input), reinterpret_cast<void*>(output),
                                    inputSize, outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
                                    nBlocks, nThreadsPerBlock, extras);
              },
              nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
              nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
              nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0,
              nb::arg("extras") = std::unordered_map<std::string, uintptr_t>());

  // Nested type: exposed as Algorithm.Constraint in Python.
  nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")
      .def(nb::init<>())
      .def(nb::init<int, int>(), nb::arg("world_size"), nb::arg("n_ranks_per_node"))
      .def_rw("world_size", &Algorithm::Constraint::worldSize)
      .def_rw("n_ranks_per_node", &Algorithm::Constraint::nRanksPerNode);

  nb::class_<AlgorithmBuilder>(m, "AlgorithmBuilder").def("build", &AlgorithmBuilder::build);

  // DslAlgorithm derives from Algorithm (bound above).
  nb::class_<DslAlgorithm, Algorithm>(m, "DslAlgorithm")
      .def(nb::init<std::string, ExecutionPlan, std::unordered_map<std::string, uint64_t>, Algorithm::Constraint>(),
           nb::arg("id"), nb::arg("plan"), nb::arg("tags") = std::unordered_map<std::string, uint64_t>(),
           nb::arg("constraint") = Algorithm::Constraint())
      .def("build", &DslAlgorithm::build);

  nb::class_<AlgorithmCollection>(m, "AlgorithmCollection")
      .def("register_algorithm", &AlgorithmCollection::registerAlgorithm, nb::arg("collective"), nb::arg("algo_name"),
           nb::arg("algorithm"))
      .def("get_algorithms_by_collective", &AlgorithmCollection::getAlgorithmsByCollective, nb::arg("collective"))
      .def("to_list", &AlgorithmCollection::getAllAlgorithms);

  // Read-only view of a selection request; pointer fields are exposed as
  // integer addresses, and reference members are returned by value/lambda.
  nb::class_<CollectiveRequest>(m, "CollectiveRequest")
      .def_ro("world_size", &CollectiveRequest::worldSize)
      .def_ro("n_ranks_per_node", &CollectiveRequest::nRanksPerNode)
      .def_ro("rank", &CollectiveRequest::rank)
      .def_prop_ro("input_buffer",
                   [](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.inputBuffer); })
      .def_prop_ro("output_buffer",
                   [](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.outputBuffer); })
      .def_ro("message_size", &CollectiveRequest::messageSize)
      .def_prop_ro("collective", [](const CollectiveRequest& self) { return self.collective; })
      .def_ro("dtype", &CollectiveRequest::dtype)
      .def_prop_ro("hints", [](const CollectiveRequest& self) { return self.hints; })
      .def("buffer_mode", &CollectiveRequest::bufferMode);
}

View File

@@ -3,7 +3,6 @@
#include <nanobind/nanobind.h>
#include <nanobind/operators.h>
#include <nanobind/stl/array.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/vector.h>
@@ -26,6 +25,10 @@ extern void register_nvls(nb::module_& m);
extern void register_executor(nb::module_& m);
extern void register_npkit(nb::module_& m);
extern void register_gpu_utils(nb::module_& m);
extern void register_algorithm(nb::module_& m);
// ext
extern void register_algorithm_collection_builder(nb::module_& m);
template <typename T>
void def_shared_future(nb::handle& m, const std::string& typestr) {
@@ -36,6 +39,13 @@ void def_shared_future(nb::handle& m, const std::string& typestr) {
void register_core(nb::module_& m) {
m.def("version", &version);
nb::enum_<DataType>(m, "DataType")
.value("int32", DataType::INT32)
.value("uint32", DataType::UINT32)
.value("float16", DataType::FLOAT16)
.value("float32", DataType::FLOAT32)
.value("bfloat16", DataType::BFLOAT16);
nb::class_<Bootstrap>(m, "Bootstrap")
.def("get_rank", &Bootstrap::getRank)
.def("get_n_ranks", &Bootstrap::getNranks)
@@ -61,7 +71,15 @@ void register_core(nb::module_& m) {
.def("recv", static_cast<void (Bootstrap::*)(std::vector<char>&, int, int)>(&Bootstrap::recv), nb::arg("data"),
nb::arg("peer"), nb::arg("tag"));
nb::class_<UniqueId>(m, "UniqueId");
nb::class_<UniqueId>(m, "UniqueId")
.def(nb::init<>())
.def("__setstate__",
[](UniqueId& self, nb::bytes b) {
if (nb::len(b) != UniqueIdBytes) throw std::runtime_error("Invalid UniqueId byte size");
::memcpy(self.data(), b.c_str(), UniqueIdBytes);
})
.def("__getstate__",
[](const UniqueId& self) { return nb::bytes(reinterpret_cast<const char*>(self.data()), UniqueIdBytes); });
nb::class_<TcpBootstrap, Bootstrap>(m, "TcpBootstrap")
.def(nb::init<int, int>(), "Do not use this constructor. Use create instead.")
@@ -284,4 +302,8 @@ NB_MODULE(_mscclpp, m) {
register_executor(m);
register_npkit(m);
register_gpu_utils(m);
register_algorithm(m);
// ext
register_algorithm_collection_builder(m);
}

View File

@@ -15,50 +15,8 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_executor(nb::module_& m) {
nb::enum_<DataType>(m, "DataType")
.value("int32", DataType::INT32)
.value("uint32", DataType::UINT32)
.value("float16", DataType::FLOAT16)
.value("float32", DataType::FLOAT32)
.value("bfloat16", DataType::BFLOAT16);
nb::enum_<PacketType>(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
nb::class_<ExecutionRequest>(m, "ExecutionRequest")
.def_ro("world_size", &ExecutionRequest::worldSize)
.def_ro("n_ranks_per_node", &ExecutionRequest::nRanksPerNode)
.def_prop_ro(
"input_buffer",
[](const ExecutionRequest& self) -> uintptr_t { return reinterpret_cast<uintptr_t>(self.inputBuffer); })
.def_prop_ro(
"output_buffer",
[](const ExecutionRequest& self) -> uintptr_t { return reinterpret_cast<uintptr_t>(self.outputBuffer); })
.def_ro("message_size", &ExecutionRequest::messageSize)
.def_prop_ro("collective", [](ExecutionRequest& self) -> const std::string& { return self.collective; })
.def_prop_ro("hints", [](ExecutionRequest& self) { return self.hints; });
nb::class_<ExecutionPlanHandle>(m, "ExecutionPlanHandle")
.def_ro("id", &ExecutionPlanHandle::id)
.def_ro("constraint", &ExecutionPlanHandle::constraint)
.def_ro("plan", &ExecutionPlanHandle::plan)
.def_ro("tags", &ExecutionPlanHandle::tags)
.def_static("create", &ExecutionPlanHandle::create, nb::arg("id"), nb::arg("world_size"),
nb::arg("nranks_per_node"), nb::arg("plan"),
nb::arg("tags") = std::unordered_map<std::string, uint64_t>{});
nb::class_<ExecutionPlanHandle::Constraint>(m, "ExecutionPlanConstraint")
.def_ro("world_size", &ExecutionPlanHandle::Constraint::worldSize)
.def_ro("n_ranks_per_node", &ExecutionPlanHandle::Constraint::nRanksPerNode);
nb::class_<ExecutionPlanRegistry>(m, "ExecutionPlanRegistry")
.def_static("get_instance", &ExecutionPlanRegistry::getInstance)
.def("register_plan", &ExecutionPlanRegistry::registerPlan, nb::arg("planHandle"))
.def("get_plans", &ExecutionPlanRegistry::getPlans, nb::arg("collective"))
.def("get", &ExecutionPlanRegistry::get, nb::arg("id"))
.def("set_selector", &ExecutionPlanRegistry::setSelector, nb::arg("selector"))
.def("set_default_selector", &ExecutionPlanRegistry::setDefaultSelector, nb::arg("selector"))
.def("clear", &ExecutionPlanRegistry::clear);
nb::class_<ExecutionPlan>(m, "ExecutionPlan")
.def(nb::init<const std::string&, int>(), nb::arg("planPath"), nb::arg("rank"))
.def_prop_ro("name", [](const ExecutionPlan& self) -> std::string { return self.name(); })

View File

@@ -0,0 +1,34 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <nanobind/nanobind.h>
#include <nanobind/stl/function.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/vector.h>
#include <mscclpp/algorithm.hpp>
#include <mscclpp/ext/collectives/algorithm_collection_builder.hpp>
namespace nb = nanobind;
using namespace mscclpp;
using namespace mscclpp::collective;
/// Register the ext AlgorithmCollectionBuilder with the Python module.
///
/// NOTE: relies on Algorithm/DslAlgorithm/AlgorithmCollection having been
/// bound first (see register_algorithm), since they appear in signatures here.
void register_algorithm_collection_builder(nb::module_& m) {
  nb::class_<AlgorithmCollectionBuilder>(m, "AlgorithmCollectionBuilder")
      .def_static("get_instance", &AlgorithmCollectionBuilder::getInstance)
      .def("add_algorithm_builder", &AlgorithmCollectionBuilder::addAlgorithmBuilder, nb::arg("builder"))
      // Convenience overload: a DslAlgorithm is itself an AlgorithmBuilder,
      // so it can be registered directly.
      .def(
          "add_dsl_algorithm_builder",
          [](AlgorithmCollectionBuilder& self, std::shared_ptr<DslAlgorithm> algorithm) {
            self.addAlgorithmBuilder(algorithm);
          },
          nb::arg("algorithm"))
      .def("set_algorithm_selector", &AlgorithmCollectionBuilder::setAlgorithmSelector, nb::arg("selector"))
      .def("set_fallback_algorithm_selector", &AlgorithmCollectionBuilder::setFallbackAlgorithmSelector,
           nb::arg("selector"))
      .def("build", &AlgorithmCollectionBuilder::build)
      // scratch_buffer is an integer device address (uintptr_t) — see the
      // C++ declaration of buildDefaultAlgorithms.
      .def("build_default_algorithms", &AlgorithmCollectionBuilder::buildDefaultAlgorithms, nb::arg("scratch_buffer"),
           nb::arg("scratch_buffer_size"), nb::arg("rank"))
      .def_static("reset", &AlgorithmCollectionBuilder::reset);
}

View File

@@ -3,43 +3,26 @@
"""MSCCL++ Python API."""
import atexit
from dataclasses import dataclass
from functools import cached_property, wraps
import inspect
import json
from functools import wraps
import os
from pathlib import Path
from typing import Any
import warnings
from blake3 import blake3
from mscclpp.language.program import CollectiveProgram
from mscclpp.language.utils import AlgoSpec
from functools import wraps
from mscclpp._version import __version__, __commit_id__
from ._version import __version__, __commit_id__
if os.environ.get("MSCCLPP_HOME", None) is None:
os.environ["MSCCLPP_HOME"] = os.path.abspath(os.path.dirname(__file__))
# Parse the version
version = {
"version": __version__,
"git_commit": __commit_id__,
}
from ._core import *
from ._mscclpp import (
Env,
ErrorCode,
BaseError,
Error,
SysError,
CudaError,
CuError,
IbError,
Device,
DeviceType,
Communicator,
@@ -60,16 +43,15 @@ from ._mscclpp import (
Transport,
TransportFlags,
DataType,
ErrorCode,
Executor,
ExecutionPlan,
ExecutionPlanConstraint,
PacketType,
RawGpuBuffer,
ReduceOp,
env,
is_nvls_supported,
npkit,
ExecutionPlanHandle as _ExecutionPlanHandle,
ExecutionPlanRegistry as _ExecutionPlanRegistry,
)
__all__ = [
@@ -79,6 +61,7 @@ __all__ = [
"Connection",
"connect_nvls_collective",
"EndpointConfig",
"ErrorCode",
"Fifo",
"Semaphore",
"Host2DeviceSemaphore",
@@ -97,6 +80,7 @@ __all__ = [
"ExecutionPlan",
"PacketType",
"RawGpuBuffer",
"ReduceOp",
"env",
"version",
"is_nvls_supported",
@@ -107,6 +91,11 @@ __all__ = [
"version",
"get_include",
"get_lib",
# Python API
"Algorithm",
"AlgorithmCollection",
"CommGroup",
"GpuBuffer",
]
@@ -135,193 +124,5 @@ def deprecated(new_cls):
return decorator
class ExecutionPlanHandle:
    """Thin Python wrapper around a native ``_ExecutionPlanHandle``.

    Field values are fetched from the native object once and memoized via
    ``cached_property``.
    """

    def __init__(self, handle: _ExecutionPlanHandle):
        self._handle = handle

    @cached_property
    def id(self) -> str:
        # Plan ids are blake3 hex digests produced by `compile`, hence `str`.
        return self._handle.id

    @cached_property
    def tags(self) -> frozenset:
        # Frozen so the cached value cannot be mutated by callers.
        return frozenset(self._handle.tags)

    @cached_property
    def plan(self) -> ExecutionPlan:
        return self._handle.plan

    @cached_property
    def constraints(self) -> ExecutionPlanConstraint:
        return self._handle.constraints
@dataclass(frozen=True)
class ExecutionRequest:
    """Immutable description of one collective invocation, handed to a plan selector."""

    collective: str  # collective name, e.g. "allreduce"
    world_size: int  # total number of ranks
    n_ranks_per_node: int  # ranks per node
    send_buffer: int  # send buffer address as int — presumably a device pointer; confirm with callers
    recv_buffer: int  # receive buffer address as int — presumably a device pointer; confirm with callers
    message_size: int  # message size in bytes
    hints: dict  # free-form selection hints (name -> value)
class ExecutionPlanRegistry:
    """Process-wide singleton mapping execution plans to collectives.

    Mirrors the native ``_ExecutionPlanRegistry`` singleton and keeps a
    Python-side index (by plan id and by collective name) so a Python
    selector callback can pick a plan for a given request.
    """

    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(ExecutionPlanRegistry, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        # Guard so repeated construction of the singleton does not wipe
        # already-registered state.
        if not hasattr(self, "_initialized"):
            self._registry = _ExecutionPlanRegistry.get_instance()
            self._id_map = {}
            self._collective_map = {}
            self._selector = None
            self._initialized = True

    def register_plan(self, plan: ExecutionPlanHandle):
        """Index `plan` by id and collective, then register it with the native registry."""
        self._id_map[plan.id] = plan
        self._collective_map.setdefault(plan.plan.collective, []).append(plan)
        # Fix: use `self._registry` directly instead of the redundant
        # `self._instance._registry` indirection (this is a singleton).
        return self._registry.register_plan(plan._handle)

    def set_selector(self, selector):
        """Install `selector` as the plan-selection callback (Python and native side)."""
        self._selector = selector
        self._registry.set_selector(selector)

    def set_default_selector(self, selector):
        """Install `selector` as the fallback plan-selection callback."""
        self._selector = selector
        self._registry.set_default_selector(selector)

    def get(self, id: str) -> ExecutionPlanHandle:
        """Return the handle registered under `id`, or None when unknown."""
        return self._id_map.get(id, None)

    def select(
        self,
        collective: str,
        world_size: int,
        n_ranks_per_node: int,
        send_buffer: int,
        recv_buffer: int,
        message_size: int,
        hints: dict = None,
    ) -> ExecutionPlanHandle:
        """Ask the installed selector for a plan matching the request.

        Returns None when no selector is installed or no plan is registered
        for `collective`.
        """
        if self._selector is None or collective not in self._collective_map:
            return None
        req = ExecutionRequest(
            collective=collective,
            world_size=world_size,
            n_ranks_per_node=n_ranks_per_node,
            send_buffer=send_buffer,
            recv_buffer=recv_buffer,
            message_size=message_size,
            # Fix: `None` sentinel instead of a mutable `{}` default — a
            # shared default dict would leak state across requests if a
            # selector ever mutated `req.hints`.
            hints=hints if hints is not None else {},
        )
        return self._selector(self._collective_map[collective], req)

    @classmethod
    def reset_instance(cls):
        """Clear native and Python-side state and drop the singleton."""
        if cls._instance is not None:
            cls._instance._registry.clear()
            cls._instance._id_map = {}
            cls._instance._collective_map = {}
            cls._instance._selector = None
            cls._instance = None
# Release native registry state at interpreter shutdown, then create the
# process-wide singleton used by `compile`.
atexit.register(ExecutionPlanRegistry.reset_instance)
_execution_plan_registry = ExecutionPlanRegistry()
def _stable_json_bytes(obj: Any) -> bytes:
return json.dumps(
obj,
sort_keys=True,
ensure_ascii=False,
separators=(",", ":"),
).encode("utf-8")
def compile(
    algo,
    algo_spec: AlgoSpec,
    rank: int,
    **kwargs,
) -> ExecutionPlanHandle:
    """Compile a MSCCL++ program from a high-level algorithm description.

    Args:
        algo: The high-level algorithm description (e.g., a function or class).
        algo_spec (AlgoSpec): Algorithm specification containing collective type,
            world size, ranks per node, instances, protocol, and other configuration.
        rank (int): The rank of the current process.
        **kwargs: Additional keyword arguments passed to the algorithm function.

    Returns:
        ExecutionPlanHandle: The compiled execution plan handle.

    Raises:
        ValueError: If the 'algo' argument is not callable.

    Note:
        Plans are cached on disk under ``MSCCLPP_EXECUTION_PLAN_DIR``
        (default ``~/.cache/mscclpp``); the cache key hashes the algorithm
        source, the spec, and the package version.
    """
    if not callable(algo):
        raise ValueError("The 'algo' argument must be a callable (e.g., a function or class).")
    prog: CollectiveProgram = algo(
        algo_spec,
        **kwargs,
    )
    # Cache key: any change to the algorithm source or its configuration
    # yields a new plan id (and thus a new plan file).
    source = inspect.getsource(algo)
    source_hash = blake3(source.encode("utf-8")).hexdigest()
    plan_id = blake3(
        _stable_json_bytes(
            {
                "version": __version__,
                "algo_name": algo_spec.name,
                "collective": algo_spec.collective.name,
                "tags": sorted(algo_spec.tags.items()),
                "source_hash": source_hash,
                "envs": {
                    "nranks_per_node": algo_spec.nranks_per_node,
                    "world_size": algo_spec.world_size,
                    "instances": algo_spec.instances,
                    "protocol": algo_spec.protocol,
                },
            }
        )
    ).hexdigest()
    plan_handle = _execution_plan_registry.get(plan_id)
    if plan_handle is not None:
        return plan_handle
    plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp")
    os.makedirs(plan_dir, exist_ok=True)
    filename = f"{plan_id}.json"
    plan_path = os.path.join(plan_dir, filename)
    tmp_path = plan_path + f".tmp.{os.getpid()}"
    if not os.path.exists(plan_path):
        try:
            # TODO (binyli): Each rank could generate its own execution plan separately. Doesn't need to generate whole plan.
            with open(tmp_path, "w") as f:
                prog.post_process_operations()
                f.write(prog.to_json(indent=None, separators=(",", ":"), ensure_ascii=False))
                f.flush()
                os.fsync(f.fileno())
            # Publish atomically; another rank may have won the race.
            if not os.path.exists(plan_path):
                os.rename(tmp_path, plan_path)
            else:
                os.remove(tmp_path)
        except Exception:
            # Bug fix: clean up our private temp file instead of unlinking
            # `plan_path`, which may be a valid plan that another process
            # just published. The old code also leaked `tmp_path`.
            Path(tmp_path).unlink(missing_ok=True)
    execution_plan = ExecutionPlan(plan_path, rank)
    handle = _ExecutionPlanHandle.create(
        id=plan_id,
        world_size=algo_spec.world_size,
        nranks_per_node=algo_spec.nranks_per_node,
        plan=execution_plan,
        tags=algo_spec.tags,
    )
    return ExecutionPlanHandle(handle)
# Module-level compiler entry points: `compile` builds DSL algorithms,
# `compile_native` builds native CUDA/HIP extension modules.
compile: DslCompiler = DslCompiler()
compile_native: NativeCodeCompiler = NativeCodeCompiler()

View File

@@ -0,0 +1,13 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from .algorithm import *
from .comm import *
from .compiler import *
from .buffer import *
__all__ = []
__all__ += algorithm.__all__
__all__ += comm.__all__
__all__ += compiler.__all__
__all__ += buffer.__all__

View File

@@ -0,0 +1,230 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import annotations
from typing import Optional, Tuple, Dict
from functools import cached_property
from mscclpp._mscclpp import (
Algorithm as _Algorithm,
DslAlgorithm as _DslAlgorithm,
AlgorithmType as _AlgorithmType,
Communicator,
CollectiveBufferMode,
DataType,
Executor,
ExecutionPlan,
ReduceOp,
)
__all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"]
class Algorithm:
    """A wrapper for collective communication algorithms.

    This class provides a Python interface for collective communication algorithms
    such as allreduce, allgather, and reduce-scatter. Algorithms can be either
    DSL-based (defined using MSCCL++ execution plans) or native (implemented in C++/CUDA).

    Attributes:
        name: Human-readable name of the algorithm.
        collective: The collective operation this algorithm implements (e.g., "allreduce").
        message_size_range: Tuple of (min_size, max_size) in bytes for valid message sizes.
        tags: Dictionary of tag names to tag values for algorithm selection hints.
        buffer_mode: The buffer mode supported by this algorithm (IN_PLACE, OUT_OF_PLACE, or ANY).
    """

    class Constraint:
        """Constraints that define valid execution environments for the algorithm.

        Args:
            world_size: Required world size (number of ranks). 0 means any size.
            n_ranks_per_node: Required number of ranks per node. 0 means any.
        """

        def __init__(self, world_size: int = 0, n_ranks_per_node: int = 0):
            self._constraint = _Algorithm.Constraint(world_size, n_ranks_per_node)

        @property
        def world_size(self) -> int:
            return self._constraint.worldSize

        @property
        def n_ranks_per_node(self) -> int:
            return self._constraint.nRanksPerNode

    def __init__(
        self,
        id: Optional[str] = None,
        execution_plan: Optional[ExecutionPlan] = None,
        native_handle: Optional[_Algorithm] = None,
        tags: Optional[Dict[str, int]] = None,
        constraint: Optional[Constraint] = None,
    ):
        """Wrap either a DSL execution plan or an existing native handle.

        Exactly one of `execution_plan` or `native_handle` should be given;
        `execution_plan` takes precedence when both are present.

        Raises:
            ValueError: If neither `execution_plan` nor `native_handle` is provided.
        """
        if execution_plan is not None:
            self._algorithm = _DslAlgorithm(
                id,
                execution_plan,
                tags=tags if tags is not None else {},
                constraint=constraint._constraint if constraint is not None else _Algorithm.Constraint(),
            )
        elif native_handle is not None:
            self._algorithm = native_handle
        else:
            # Bug fix: previously `_algorithm` was silently left unset here,
            # deferring the failure to the first attribute access. Fail fast.
            raise ValueError("Either 'execution_plan' or 'native_handle' must be provided.")

    @classmethod
    def create_from_native_handle(cls, handle: _Algorithm):
        """Create an Algorithm instance from a native C++ algorithm handle.

        Args:
            handle: The native C++ algorithm handle.

        Returns:
            A new Algorithm instance wrapping the native handle.
        """
        return cls(
            native_handle=handle,
        )

    @classmethod
    def create_from_native_capsule(cls, obj):
        """Create an Algorithm instance from a PyCapsule object.

        Args:
            obj: A PyCapsule containing a native algorithm pointer.

        Returns:
            A new Algorithm instance wrapping the algorithm from the capsule.
        """
        handle = _Algorithm.from_native_capsule(obj)
        return cls(native_handle=handle)

    @cached_property
    def name(self) -> str:
        """The human-readable name of the algorithm."""
        return self._algorithm.name

    @cached_property
    def collective(self) -> str:
        """The collective operation this algorithm implements (e.g., "allreduce", "allgather")."""
        return self._algorithm.collective

    @cached_property
    def message_size_range(self) -> Tuple[int, int]:
        """The valid message size range (min_size, max_size) in bytes."""
        return (self._algorithm.message_range[0], self._algorithm.message_range[1])

    @cached_property
    def tags(self) -> Dict[str, int]:
        """Dictionary of tag names to tag values for algorithm selection hints."""
        return self._algorithm.tags

    @cached_property
    def buffer_mode(self) -> CollectiveBufferMode:
        """The buffer mode supported by this algorithm (IN_PLACE, OUT_OF_PLACE, or ANY)."""
        return self._algorithm.buffer_mode

    def is_dsl_algorithm(self) -> bool:
        """Check if this is a DSL-based algorithm.

        Returns:
            True if this algorithm is defined using DSL/execution plan, False otherwise.
        """
        # Idiom fix: return the comparison directly instead of if/return True.
        return self._algorithm.type == _AlgorithmType.DSL

    def is_native_algorithm(self) -> bool:
        """Check if this is a native C++/CUDA algorithm.

        Returns:
            True if this algorithm is implemented natively, False otherwise.
        """
        return self._algorithm.type == _AlgorithmType.NATIVE

    def execute(
        self,
        comm: Communicator,
        input_buffer: int,
        output_buffer: int,
        input_size: int,
        output_size: int,
        dtype: DataType,
        op: ReduceOp = ReduceOp.NOP,
        stream: int = 0,
        executor: Optional[Executor] = None,
        nblocks=0,
        nthreads_per_block=0,
        extras: Optional[Dict[str, int]] = None,
    ) -> int:
        """Execute the collective algorithm.

        Args:
            comm: The communicator to use.
            input_buffer: Device pointer to the input buffer.
            output_buffer: Device pointer to the output buffer.
            input_size: Size of the input buffer in bytes.
            output_size: Size of the output buffer in bytes.
            dtype: Data type of the elements.
            op: Reduction operation for reduce-type collectives (default: NOP).
            stream: CUDA stream to execute on (default: 0).
            executor: The executor for DSL algorithms (required for DSL, optional for native).
            nblocks: Number of CUDA blocks (0 for auto-selection).
            nthreads_per_block: Number of threads per block (0 for auto-selection).
            extras: Additional algorithm-specific parameters.

        Returns:
            The result code (0 for success).
        """
        return self._algorithm.execute(
            comm,
            int(input_buffer),
            int(output_buffer),
            input_size,
            output_size,
            dtype,
            op,
            int(stream),
            executor,
            nblocks,
            nthreads_per_block,
            extras if extras is not None else {},
        )
class AlgorithmBuilder:
    """Wraps a native algorithm builder and produces `Algorithm` instances.

    NOTE(review): the `_AlgorithmBuilder` annotation is not among this
    module's visible imports; harmless at runtime under
    `from __future__ import annotations`, but confirm the import for
    type checkers.
    """

    def __init__(self, algorithm_builder: _AlgorithmBuilder):
        self._algorithm_builder = algorithm_builder

    def build(self) -> Algorithm:
        # Build the native algorithm, then wrap it in the Python facade.
        return Algorithm.create_from_native_handle(self._algorithm_builder.build())
class AlgorithmCollection:
    """An iterable, indexable Python view over a native algorithm collection."""

    def __init__(self, native_collection: _AlgorithmCollection):
        self._native_collection = native_collection
        # Wrap every native handle in the Python `Algorithm` facade up front.
        native_handles = self._native_collection.to_list()
        self._algorithms = list(map(Algorithm.create_from_native_handle, native_handles))

    def __iter__(self):
        """Iterate over all algorithms in the collection."""
        return iter(self._algorithms)

    def __len__(self):
        """Return the number of algorithms in the collection."""
        return len(self._algorithms)

    def __getitem__(self, index: int) -> Algorithm:
        """Get an algorithm by index."""
        return self._algorithms[index]

    def get_by_collective(self, collective: str):
        """Get all algorithms for a specific collective operation."""
        matches = []
        for candidate in self._algorithms:
            if candidate.collective == collective:
                matches.append(candidate)
        return matches

    def register_algorithm(self, collective: str, algo_name: str, algorithm: Algorithm):
        """Register an algorithm for a collective operation."""
        self._native_collection.register_algorithm(collective, algo_name, algorithm._algorithm)
        self._algorithms.append(algorithm)

View File

@@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from typing import Union, Tuple
import cupy as cp
import numpy as np
from mscclpp._mscclpp import RawGpuBuffer
__all__ = ["GpuBuffer"]
class GpuBuffer(cp.ndarray):
    """A cupy ndarray backed by a raw MSCCL++ GPU allocation (``RawGpuBuffer``).

    NOTE(review): ``__new__`` returns the result of ``cp.ndarray(...)``, i.e. a
    plain ``cp.ndarray`` view over the allocation — not an instance of
    ``GpuBuffer`` — so ``isinstance(buf, GpuBuffer)`` is False. Confirm this
    is intentional.
    """

    def __new__(
        cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
    ):
        # Check if `shape` is valid
        if isinstance(shape, int):
            shape = (shape,)
        try:
            shape = tuple(shape)
        except TypeError:
            raise ValueError("Shape must be a tuple-like or an integer.")
        if any(s <= 0 for s in shape):
            raise ValueError("Shape must be positive.")
        # Create the buffer. Passing `buffer` as the UnownedMemory owner keeps
        # the RawGpuBuffer alive as long as the ndarray references it.
        buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
        memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
        return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)

View File

@@ -2,22 +2,19 @@
# Licensed under the MIT license.
from __future__ import annotations
from typing import Tuple, Type
from typing import Type
import cupy as cp
from ._mscclpp import (
from mscclpp._mscclpp import (
Communicator,
Connection,
connect_nvls_collective,
EndpointConfig,
Semaphore,
Host2DeviceSemaphore,
Host2HostSemaphore,
ProxyService,
RegisteredMemory,
PortChannel,
MemoryChannel,
MemoryDevice2DeviceSemaphore,
TcpBootstrap,
Transport,
TransportFlags,
@@ -27,6 +24,8 @@ import numpy as np
from mscclpp.utils import is_torch_tensor
__all__ = ["CommGroup"]
class CommGroup:
def __init__(

View File

@@ -0,0 +1,350 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import annotations
import importlib.util
import inspect
import logging
import json
import os
import subprocess
import fcntl
from typing import Any, Callable
from pathlib import Path
import pybind11
import sys
import sysconfig
from blake3 import blake3
import cupy as cp
from mscclpp._version import __version__
from .algorithm import Algorithm
from mscclpp.language.program import CollectiveProgram
from mscclpp.language.utils import AlgoSpec
from mscclpp.utils import get_device_arch
from mscclpp._mscclpp import (
ExecutionPlan,
)
logging.basicConfig(level=logging.INFO)
__all__ = ["DslCompiler", "NativeCodeCompiler"]
def _stable_json_bytes(obj: Any) -> bytes:
return json.dumps(
obj,
sort_keys=True,
ensure_ascii=False,
separators=(",", ":"),
).encode("utf-8")
class DslCompiler:
    """Compiler for MSCCL++ DSL (Domain-Specific Language) algorithms.

    This compiler transforms high-level algorithm descriptions written in Python
    into execution plans that can be run on GPUs. The compiled plans are cached
    to disk for reuse.

    The cache location can be configured via the `MSCCLPP_EXECUTION_PLAN_DIR`
    environment variable (defaults to `~/.cache/mscclpp`).

    Example:
        >>> compiler = DslCompiler()
        >>> algo = compiler.compile(my_allreduce_algo, algo_spec, rank=0)
    """

    def __init__(self):
        pass

    def __call__(self, algo: Callable[..., CollectiveProgram], algo_spec: AlgoSpec, rank: int, **kwds) -> Algorithm:
        # Allow the compiler instance to be used directly as a function.
        return self.compile(algo, algo_spec, rank, **kwds)

    def compile(
        self,
        algo: Callable[..., CollectiveProgram],
        algo_spec: AlgoSpec,
        rank: int,
        **kwargs,
    ) -> Algorithm:
        """Compile a MSCCL++ DSL program from a high-level algorithm description.

        Args:
            algo: A callable (function or class) that takes an AlgoSpec and returns
                a CollectiveProgram. This defines the communication pattern.
            algo_spec: Algorithm specification (collective type, world size,
                ranks per node, instances, protocol, name, and selection tags).
            rank: The rank of the current process (0 to world_size-1).
            **kwargs: Additional keyword arguments passed to the algorithm function.

        Returns:
            Algorithm: The compiled algorithm ready for execution.

        Raises:
            ValueError: If the 'algo' argument is not callable.

        Note:
            Compiled execution plans are cached to disk. The cache key is computed
            from the algorithm source code, specification, and MSCCL++ version.
            Subsequent calls with the same inputs will reuse the cached plan.
        """
        if not callable(algo):
            raise ValueError("The 'algo' argument must be a callable (e.g., a function or class).")
        prog: CollectiveProgram = algo(
            algo_spec,
            **kwargs,
        )
        # Cache key: any change to the source or configuration yields a new id.
        source = inspect.getsource(algo)
        source_hash = blake3(source.encode("utf-8")).hexdigest()
        plan_id = blake3(
            _stable_json_bytes(
                {
                    "version": __version__,
                    "algo_name": algo_spec.name,
                    "collective": algo_spec.collective.name,
                    "tags": sorted(algo_spec.tags.items()),
                    "source_hash": source_hash,
                    "envs": {
                        "nranks_per_node": algo_spec.nranks_per_node,
                        "world_size": algo_spec.world_size,
                        "instances": algo_spec.instances,
                        "protocol": algo_spec.protocol,
                    },
                }
            )
        ).hexdigest()
        plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp")
        os.makedirs(plan_dir, exist_ok=True)
        filename = f"{plan_id}.json"
        plan_path = os.path.join(plan_dir, filename)
        tmp_path = plan_path + f".tmp.{os.getpid()}"
        if not os.path.exists(plan_path):
            try:
                # TODO (binyli): Each rank could generate its own execution plan separately. Doesn't need to generate whole plan.
                with open(tmp_path, "w") as f:
                    prog.post_process_operations()
                    f.write(prog.to_json(indent=None, separators=(",", ":"), ensure_ascii=False))
                    f.flush()
                    os.fsync(f.fileno())
                # Publish atomically; another rank may have won the race.
                if not os.path.exists(plan_path):
                    os.rename(tmp_path, plan_path)
                else:
                    os.remove(tmp_path)
            except Exception:
                # Bug fix: clean up our private temp file instead of unlinking
                # `plan_path`, which may be a valid plan just published by a
                # concurrent process. The old code also leaked `tmp_path`.
                Path(tmp_path).unlink(missing_ok=True)
        execution_plan = ExecutionPlan(plan_path, rank)
        return Algorithm(
            id=plan_id,
            execution_plan=execution_plan,
            constraint=Algorithm.Constraint(
                world_size=algo_spec.world_size, n_ranks_per_node=algo_spec.nranks_per_node
            ),
            tags=algo_spec.tags,
        )
class NativeCodeCompiler:
    """Compiler for native CUDA/HIP algorithm implementations.

    This compiler takes CUDA or HIP source files containing custom collective
    algorithm kernels and compiles them into loadable Python modules using
    pybind11 bindings.

    The compiler automatically detects whether to use NVCC (CUDA) or HIPCC (ROCm)
    based on the runtime environment. Compiled modules are cached to avoid
    recompilation.

    The cache location can be configured via the `MSCCLPP_NATIVE_CACHE_DIR`
    environment variable (defaults to `~/.cache/mscclpp/native`).

    Attributes:
        _is_hip: True if running on AMD/ROCm, False for NVIDIA/CUDA.
        _device_arch: The GPU architecture string (e.g., "sm_90" or "gfx90a").
        _compiler: Path to the compiler executable (nvcc or hipcc).

    Example:
        >>> compiler = NativeCodeCompiler()
        >>> module = compiler.compile("my_kernel", "path/to/kernel.cu")
        >>> algo = module.create_algorithm()
    """

    def __init__(self):
        self._is_hip = cp.cuda.runtime.is_hip
        self._device_arch = get_device_arch()
        self._compiler = self._get_compiler()
        self._default_options = ["-std=c++17", "-O3", "--shared"]
        # Python / pybind11 headers and the interpreter library.
        python_include = sysconfig.get_path("include")
        pybind11_include = pybind11.get_include()
        self._default_options += [f"-I{python_include}", f"-I{pybind11_include}"]
        python_lib = f"-lpython{sys.version_info.major}.{sys.version_info.minor}"
        self._default_options.append(python_lib)
        self._lib_home = os.path.abspath(os.path.dirname(__file__))
        if not self._is_hip:
            # Format: -gencode=arch=compute_90,code=sm_90
            compute_arch = self._device_arch.replace("sm_", "compute_")
            arch_flag = f"-gencode=arch={compute_arch},code={self._device_arch}"
            self._default_options.append(arch_flag)
            self._default_options += ["--compiler-options", "-fPIC"]
            self._default_options += ["--linker-options", f"-rpath,{self._lib_home}/lib"]
        else:
            # Format for HIP: --offload-arch=gfx90a
            arch_flag = f"--offload-arch={self._device_arch}"
            self._default_options.append(arch_flag)
            self._default_options += ["-fPIC"]
            self._default_options += ["-D__HIP_PLATFORM_AMD__"]
            self._default_options += [f"-Wl,-rpath,{self._lib_home}/lib"]
        # MSCCL++ headers and library shipped alongside this package.
        self._default_options = self._default_options + [
            "-I" + os.path.join(self._lib_home, "include"),
            "-L" + os.path.join(self._lib_home, "lib"),
            "-lmscclpp",
        ]
        cache_root = os.environ.get("MSCCLPP_NATIVE_CACHE_DIR", Path.home() / ".cache/mscclpp/native")
        self._cache_dir = Path(cache_root)
        self._cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_compiler(self) -> str:
        """Get the path to the appropriate compiler.

        Returns:
            Path to nvcc (CUDA) or hipcc (ROCm) compiler.
        """
        if self._is_hip:
            rocm_home = os.environ.get("ROCM_HOME")
            return os.path.join(rocm_home, "bin/hipcc") if rocm_home else "hipcc"
        else:
            cuda_home = os.environ.get("CUDA_HOME")
            return os.path.join(cuda_home, "bin/nvcc") if cuda_home else "nvcc"

    def get_arch(self):
        """Get the target GPU architecture.

        Returns:
            str: The GPU architecture string (e.g., "sm_90" for NVIDIA or "gfx90a" for AMD).
        """
        return self._device_arch

    def __call__(self, name: str, file: str):
        # Bug fix: the old signature accepted **kwds and forwarded them to
        # compile(), which takes no extra keyword arguments — any call with
        # kwds was a guaranteed TypeError. Drop the dead **kwds.
        return self.compile(name, file)

    def compile(self, name: str, file: str):
        """Compile a native CUDA/HIP source file into a Python module.

        Compiles a CUDA (.cu) or HIP source file containing custom collective
        algorithm kernels into a dynamically loadable Python module. The module
        is expected to use pybind11 bindings to expose algorithm creation
        functions.

        Compilation results are cached based on a hash of the source code,
        compiler options, and GPU architecture. Subsequent calls with unchanged
        inputs will return the cached module.

        Args:
            name: The name of the Python module to create (used for importing).
            file: Path to the CUDA/HIP source file to compile.

        Returns:
            module: The compiled and loaded Python module containing the
                algorithm implementation.

        Raises:
            FileNotFoundError: If the specified source file does not exist.
            RuntimeError: If compilation fails (compiler not found, syntax errors, etc.).
            ImportError: If the compiled module cannot be loaded.

        Note:
            - MSCCLPP headers are automatically included in the compilation.
            - The module is cached in `MSCCLPP_NATIVE_CACHE_DIR` (default: ~/.cache/mscclpp/native).
            - File locking prevents race conditions during parallel compilation.
        """
        if not os.path.isfile(file):
            raise FileNotFoundError(f"The specified source file does not exist: {file}")
        with open(file, "rb") as source_file:
            source_bytes = source_file.read()
        source_hash = blake3(source_bytes).hexdigest()
        cache_key = blake3(
            _stable_json_bytes(
                {
                    "version": __version__,
                    "source_hash": source_hash,
                    "compiler": self._compiler,
                    "options": self._default_options,
                    "arch": self._device_arch,
                }
            )
        ).hexdigest()
        output_file = self._cache_dir / f"{name}-{cache_key}.so"
        lock_file = output_file.with_suffix(output_file.suffix + ".lock")
        # Exclusive lock: only one process compiles; the flock is released
        # when the handle is closed at the end of the `with` block.
        with open(lock_file, "w") as lock_handle:
            fcntl.flock(lock_handle, fcntl.LOCK_EX)
            if not output_file.exists():
                tmp_file = output_file.with_suffix(output_file.suffix + f".tmp.{os.getpid()}")
                compile_command = [self._compiler] + self._default_options + ["-o", str(tmp_file), file]
                try:
                    subprocess.run(compile_command, check=True, capture_output=True, text=True)
                    os.replace(tmp_file, output_file)
                except FileNotFoundError as e:
                    Path(tmp_file).unlink(missing_ok=True)
                    raise RuntimeError(
                        f"Compiler '{self._compiler}' not found. Make sure it's installed and in PATH."
                    ) from e
                except subprocess.CalledProcessError as e:
                    Path(tmp_file).unlink(missing_ok=True)
                    raise RuntimeError(
                        f"Compilation failed with return code {e.returncode}.\n"
                        f"Command: {' '.join(compile_command)}\n"
                        f"Stdout: {e.stdout}\n"
                        f"Stderr: {e.stderr}"
                    ) from e
        # Reuse an already-loaded module for the same cache key.
        module_name = name
        existing_module = sys.modules.get(module_name)
        if existing_module and getattr(existing_module, "__mscclpp_cache_key__", None) == cache_key:
            return existing_module
        spec = importlib.util.spec_from_file_location(module_name, output_file)
        if spec is None or spec.loader is None:
            raise ImportError(f"Could not load module '{name}' from '{output_file}'")
        module = importlib.util.module_from_spec(spec)
        module.__mscclpp_cache_key__ = cache_key
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
        logging.debug(f"Successfully compiled and loaded module '{name}' from '{output_file}'")
        return module

View File

@@ -1,6 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from mscclpp.language.default_algos.allreduce_2nodes import allreduce_2nodes
from mscclpp.default_algos.allreduce_2nodes import allreduce_2nodes
__all__ = ["allreduce_2nodes"]

View File

@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from .algorithm_collection_builder import *
__all__ = algorithm_collection_builder.__all__

View File

@@ -0,0 +1,60 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import annotations
from typing import Union
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection
import atexit
from mscclpp._mscclpp import (
AlgorithmCollectionBuilder as _AlgorithmCollectionBuilder,
)
__all__ = ["AlgorithmCollectionBuilder"]
class AlgorithmCollectionBuilder:
    """Python-side singleton facade over the native algorithm-collection builder."""

    _instance = None

    def __new__(cls):
        # Lazily create the one shared instance.
        if cls._instance is None:
            cls._instance = super(AlgorithmCollectionBuilder, cls).__new__(cls)
        return cls._instance

    @classmethod
    def reset(cls):
        """Reset the native builder and drop the Python singleton."""
        if cls._instance is not None:
            _AlgorithmCollectionBuilder.reset()
            cls._instance = None

    def __init__(self):
        # Initialize only once per singleton lifetime.
        if not hasattr(self, "_initialized"):
            self._builder = _AlgorithmCollectionBuilder.get_instance()
            self._initialized = True

    def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]):
        """Register either a native AlgorithmBuilder or a DSL Algorithm."""
        if isinstance(algorithm_builder, AlgorithmBuilder):
            self._builder.add_algorithm_builder(algorithm_builder._algorithm_builder)
        elif isinstance(algorithm_builder, Algorithm) and algorithm_builder.is_dsl_algorithm():
            self._builder.add_dsl_algorithm_builder(algorithm_builder._algorithm)
        else:
            raise ValueError("The 'algorithm_builder' argument must be an instance of AlgorithmBuilder or DSL Algorithm.")

    def set_algorithm_selector(self, selector):
        """Install the primary algorithm selector callback."""
        self._builder.set_algorithm_selector(selector)

    def set_fallback_algorithm_selector(self, selector):
        """Install the fallback algorithm selector callback."""
        self._builder.set_fallback_algorithm_selector(selector)

    def build(self) -> AlgorithmCollection:
        """Build and wrap the configured algorithm collection."""
        collection = self._builder.build()
        return AlgorithmCollection(collection)

    def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection:
        """Build the default algorithm set for this rank using the given scratch buffer."""
        native_collection = self._builder.build_default_algorithms(int(scratch_buffer), scratch_buffer_size, rank)
        return AlgorithmCollection(native_collection)
# Release the native builder singleton when the interpreter exits.
atexit.register(AlgorithmCollectionBuilder.reset)

View File

@@ -2,3 +2,5 @@
# Licensed under the MIT License.
"""MSCCL++ DSL."""
from .utils import *

View File

@@ -5,6 +5,8 @@ from enum import Enum
from dataclasses import dataclass, field
from mscclpp.language.collectives import Collective
__all__ = ["AlgoSpec", "ReplicationPolicy"]
class ReplicationPolicy(Enum):
interleaved = "interleaved"

View File

@@ -6,11 +6,12 @@ import os
import struct
import subprocess
import tempfile
from typing import Any, Type, Union, Tuple
from typing import Any, Type, Union
import cupy as cp
import numpy as np
from ._mscclpp import RawGpuBuffer
from mscclpp._mscclpp import DataType
try:
import torch
@@ -22,6 +23,22 @@ except ImportError:
torchTensor = Type[Any]
__all__ = [
"Kernel",
"KernelBuilder",
"pack",
"get_device_arch",
"torch_dtype_to_mscclpp_dtype",
]
def get_device_arch() -> str:
    """Return the current GPU's architecture string.

    On NVIDIA this is "sm_<cc>" (e.g. "sm_90"); on ROCm it is the raw
    gcnArchName reported by the runtime.
    """
    if not cp.cuda.runtime.is_hip:
        return f"sm_{cp.cuda.Device().compute_capability}"
    props = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)
    return props["gcnArchName"].decode("utf-8")
class Kernel:
CU_LAUNCH_PARAM_BUFFER_POINTER = 0x01
CU_LAUNCH_PARAM_BUFFER_SIZE = 0x02
@@ -86,7 +103,8 @@ class KernelBuilder:
mscclpp_home = os.environ.get("MSCCLPP_HOME", "/usr/local/mscclpp")
include_dir = os.path.join(mscclpp_home, "include")
if not cp.cuda.runtime.is_hip:
compute_capability = cp.cuda.Device().compute_capability
arch = get_device_arch()
compute_capability = arch.replace("sm_", "")
cuda_home = os.environ.get("CUDA_HOME")
nvcc = os.path.join(cuda_home, "bin/nvcc") if cuda_home else "nvcc"
command = [
@@ -104,9 +122,7 @@ class KernelBuilder:
]
else:
# the gcn arch name is like "gfx942:sramecc+:xnack-"
gcn_arch = (
cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)["gcnArchName"].decode("utf-8").split(":")[0]
)
gcn_arch = get_device_arch()
rocm_home = os.environ.get("ROCM_HOME")
hipcc = os.path.join(rocm_home, "bin/hipcc") if rocm_home else "hipcc"
command = [
@@ -138,25 +154,6 @@ class KernelBuilder:
self._tempdir.cleanup()
class GpuBuffer(cp.ndarray):
def __new__(
cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
):
# Check if `shape` is valid
if isinstance(shape, int):
shape = (shape,)
try:
shape = tuple(shape)
except TypeError:
raise ValueError("Shape must be a tuple-like or an integer.")
if any(s <= 0 for s in shape):
raise ValueError("Shape must be positive.")
# Create the buffer
buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)
def pack(*args):
res = b""
for arg in list(args):
@@ -182,3 +179,18 @@ def pack(*args):
def is_torch_tensor(tensor: Any) -> bool:
    # True only when torch imported successfully and `tensor` is a torch.Tensor.
    return _use_torch and isinstance(tensor, torchTensor)
def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType:
    """Map a torch dtype to the corresponding mscclpp DataType.

    Raises:
        RuntimeError: if PyTorch is not available.
        ValueError: if the dtype has no mscclpp equivalent.
    """
    if not _use_torch:
        raise RuntimeError("PyTorch is not available.")
    dtype_map = {
        torch.float16: DataType.float16,
        torch.float32: DataType.float32,
        torch.int32: DataType.int32,
        torch.bfloat16: DataType.bfloat16,
    }
    if dtype not in dtype_map:
        raise ValueError(f"Unknown data type: {dtype}")
    return dtype_map[dtype]

View File

@@ -1 +1,4 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from .mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5

View File

@@ -13,9 +13,7 @@ from mscclpp_op import (
from nccl_op import NcclAllReduce
from mpi4py import MPI
import cupy.cuda.nccl as nccl
import mscclpp.comm as mscclpp_comm
from mscclpp import ProxyService, is_nvls_supported
from mscclpp.utils import GpuBuffer
from mscclpp import ProxyService, is_nvls_supported, CommGroup, GpuBuffer
from prettytable import PrettyTable
import netifaces as ni
import ipaddress
@@ -160,9 +158,7 @@ def find_best_config(mscclpp_call, niter):
return best_config, best_time
def run_benchmark(
mscclpp_group: mscclpp_comm.CommGroup, nccl_op: nccl.NcclCommunicator, table: PrettyTable, niter: int, nelem: int
):
def run_benchmark(mscclpp_group: CommGroup, nccl_op: nccl.NcclCommunicator, table: PrettyTable, niter: int, nelem: int):
memory = GpuBuffer(nelem, dtype=data_type)
memory_out = GpuBuffer(nelem, dtype=data_type)
cp.cuda.runtime.deviceSynchronize()
@@ -259,9 +255,7 @@ if __name__ == "__main__":
network_interface, my_ip = get_netinterface_info()
root_ip = MPI.COMM_WORLD.bcast(my_ip, root=0)
ifIpPortTrio = network_interface + ":" + root_ip + ":50000" # some random port
mscclpp_group = mscclpp_comm.CommGroup(
interfaceIpPortTrio=ifIpPortTrio, rank=MPI.COMM_WORLD.rank, size=MPI.COMM_WORLD.size
)
mscclpp_group = CommGroup(interfaceIpPortTrio=ifIpPortTrio, rank=MPI.COMM_WORLD.rank, size=MPI.COMM_WORLD.size)
# create a NcclComm
if MPI.COMM_WORLD.rank == 0:

View File

@@ -1,9 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import cupy as cp
import ctypes
from mscclpp import Transport, ProxyService, MemoryDevice2DeviceSemaphore
import mscclpp.comm as mscclpp_comm
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
from mscclpp import CommGroup, GpuBuffer
from mscclpp.utils import KernelBuilder, pack
IB_TRANSPORTS = [
Transport.IB0,
@@ -31,7 +34,7 @@ def type_to_str(dtype):
class MscclppAllReduce1:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
memory: cp.ndarray,
read_only: int = 1,
block_size: int = 1024,
@@ -97,7 +100,7 @@ class MscclppAllReduce1:
class MscclppAllReduce2:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
memory: cp.ndarray,
memory_out: cp.ndarray,
block_size: int = 512,
@@ -164,7 +167,7 @@ class MscclppAllReduce2:
class MscclppAllReduce3:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
memory: cp.ndarray,
proxy_service: ProxyService,
block_size: int = 1024,
@@ -234,7 +237,7 @@ class MscclppAllReduce3:
class MscclppAllReduce4:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
memory: cp.ndarray,
nranks_per_node: int,
proxy_service: ProxyService,
@@ -335,7 +338,7 @@ class MscclppAllReduce4:
class MscclppAllReduce5:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
memory: cp.ndarray,
memory_out: cp.ndarray,
nranks_per_node: int,
@@ -428,7 +431,7 @@ class MscclppAllReduce5:
class MscclppAllReduce6:
def __init__(
self,
group: mscclpp_comm.CommGroup,
group: CommGroup,
nelem: int,
memory_dtype: cp.dtype,
block_size: int = 1024,

View File

@@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import cupy.cuda.nccl as nccl
from mpi4py import MPI
import cupy as cp

View File

@@ -6,4 +6,5 @@ pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
blake3
blake3
pybind11

View File

@@ -6,4 +6,5 @@ pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
blake3
blake3
pybind11

View File

@@ -10,7 +10,7 @@ from mscclpp import (
npkit,
env,
)
import mscclpp.comm as mscclpp_comm
from mscclpp import CommGroup, GpuBuffer
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
import os
import struct
@@ -180,7 +180,7 @@ def main(
n_iters: int = 10,
n_graph_iters: int = 10,
):
mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
mscclpp_group = CommGroup(MPI.COMM_WORLD)
cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
executor = Executor(mscclpp_group.communicator)
npkit_dump_dir = env().npkit_dump_dir

View File

@@ -13,7 +13,6 @@ import pytest
from mscclpp import (
ErrorCode,
Error,
DataType,
EndpointConfig,
ExecutionPlan,
@@ -31,8 +30,8 @@ from mscclpp import (
Device,
DeviceType,
)
import mscclpp.comm as mscclpp_comm
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
from mscclpp import CommGroup, GpuBuffer
from mscclpp.utils import KernelBuilder, pack
from ._cpp import _ext
from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
@@ -75,7 +74,7 @@ def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str):
# ranks are on different nodes
pytest.skip("this case is not supported as localhost will be different for different nodes")
group = mscclpp_comm.CommGroup(mpi_group.comm, ifIpPortTrio)
group = CommGroup(mpi_group.comm, ifIpPortTrio)
nelem = 1024
memory = np.zeros(nelem, dtype=np.int32)
@@ -141,7 +140,7 @@ def test_bootstrap_init_gil_release(mpi_group: MpiGroup):
mpi_group.comm.barrier()
def create_connection(group: mscclpp_comm.CommGroup, connection_type: str):
def create_connection(group: CommGroup, connection_type: str):
if connection_type == "NVLS":
all_ranks = list(range(group.nranks))
tran = Transport.CudaIpc
@@ -163,7 +162,7 @@ def create_connection(group: mscclpp_comm.CommGroup, connection_type: str):
def create_group_and_connection(mpi_group: MpiGroup, connection_type: str):
if (connection_type == "NVLink" or connection_type == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False:
pytest.skip("cannot use nvlink/nvls for cross node")
group = mscclpp_comm.CommGroup(mpi_group.comm)
group = CommGroup(mpi_group.comm)
try:
connection = create_connection(group, connection_type)
except Error as e:
@@ -282,7 +281,7 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, connection_type: str,
@parametrize_mpi_groups(2, 4, 8, 16)
def test_h2h_semaphores(mpi_group: MpiGroup):
group = mscclpp_comm.CommGroup(mpi_group.comm)
group = CommGroup(mpi_group.comm)
tran = group.my_ib_device(group.my_rank % 8)
endpoint = EndpointConfig(tran, Device(DeviceType.CPU))
remote_nghrs = list(range(group.nranks))
@@ -302,7 +301,7 @@ def test_h2h_semaphores(mpi_group: MpiGroup):
@parametrize_mpi_groups(2, 4, 8, 16)
def test_h2h_semaphores_gil_release(mpi_group: MpiGroup):
group = mscclpp_comm.CommGroup(mpi_group.comm)
group = CommGroup(mpi_group.comm)
tran = group.my_ib_device(group.my_rank % 8)
endpoint = EndpointConfig(tran, Device(DeviceType.CPU))
remote_nghrs = list(range(group.nranks))
@@ -339,7 +338,7 @@ def test_h2h_semaphores_gil_release(mpi_group: MpiGroup):
def test_nvls_connection(mpi_group: MpiGroup):
if all_ranks_on_the_same_node(mpi_group) is False:
pytest.skip("cannot use nvls for cross node")
group = mscclpp_comm.CommGroup(mpi_group.comm)
group = CommGroup(mpi_group.comm)
all_ranks = list(range(group.nranks))
nvls_connection = group.make_connection(all_ranks, Transport.CudaIpc, use_switch=True)
memory1 = GpuBuffer(2**29, cp.int8)
@@ -659,7 +658,7 @@ def test_executor(mpi_group: MpiGroup, filename: str):
if all_ranks_on_the_same_node(mpi_group) is False:
pytest.skip("algo not support cross node")
project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm)
mscclpp_group = CommGroup(mpi_group.comm)
executor = Executor(mscclpp_group.communicator)
npkit_dump_dir = env().npkit_dump_dir
if npkit_dump_dir != "":

View File

@@ -1,6 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cpp *.cu)
target_sources(mscclpp_obj PRIVATE ${SOURCES})
target_include_directories(mscclpp_obj PRIVATE include)
add_subdirectory(core)
add_subdirectory(ext)

View File

@@ -1,108 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <mscclpp/algorithm.hpp>
namespace mscclpp {
// Pimpl body for Algorithm: bundles the user-supplied callbacks together with
// the lazily-created contexts, cached by AlgorithmCtxKey.
class Algorithm::Impl {
 public:
  Impl(std::string name, std::string collective, Algorithm::InitFunc initFunc, Algorithm::KernelFunc kernelFunc,
       Algorithm::ContextInitFunc contextInitFunc, Algorithm::ContextKeyGenFunc contextKeyGenFunc)
      : name_(name),
        collective_(collective),
        initFunc_(initFunc),
        kernelLaunchFunc_(kernelFunc),
        contextInitFunc_(contextInitFunc),
        contextKeyGenFunc_(contextKeyGenFunc) {}

  // Runs the algorithm (lazy init + context lookup + kernel launch); defined below.
  int launch(std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count, DataType dtype,
             cudaStream_t stream, std::unordered_map<std::string, std::shared_ptr<void>>& extras);

  std::string name_;        // human-readable algorithm name
  std::string collective_;  // collective this algorithm implements
  Algorithm::InitFunc initFunc_;
  Algorithm::KernelFunc kernelLaunchFunc_;
  Algorithm::ContextInitFunc contextInitFunc_;
  Algorithm::ContextKeyGenFunc contextKeyGenFunc_;
  bool initialized_ = false;  // set after initFunc_ has run once
  std::unordered_map<AlgorithmCtxKey, std::shared_ptr<AlgorithmCtx>> contexts_;  // context cache
};
// Lazily initializes the algorithm on first launch, creates (and caches) a
// context for the given buffers/count/dtype, then invokes the kernel callback.
// NOTE(review): initialized_ and contexts_ are mutated without synchronization
// — assumes callers serialize launches; confirm.
int Algorithm::Impl::launch(std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                            DataType dtype, cudaStream_t stream,
                            std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  if (!initialized_) {
    initFunc_(comm, extras);
    initialized_ = true;
  }
  // Contexts are cached so repeated launches with the same key skip setup.
  AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, count, dtype);
  auto it = contexts_.find(ctxKey);
  if (it == contexts_.end()) {
    auto ctx = contextInitFunc_(comm, input, output, count, dtype);
    contexts_[ctxKey] = ctx;
  }
  return kernelLaunchFunc_(contexts_[ctxKey], input, output, count, dtype, stream, extras);
}
Algorithm::Algorithm(std::string name, std::string collective, InitFunc initFunc, KernelFunc kernelFunc,
                     ContextInitFunc contextInitFunc, ContextKeyGenFunc contextKeyGenFunc)
    : impl_(std::make_shared<Impl>(name, collective, initFunc, kernelFunc, contextInitFunc, contextKeyGenFunc)) {}

// Forwards to Impl::launch; returns the kernel callback's status code.
int Algorithm::launch(std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t count,
                      DataType dtype, cudaStream_t stream,
                      std::unordered_map<std::string, std::shared_ptr<void>>& extras) {
  return this->impl_->launch(comm, input, output, count, dtype, stream, extras);
}

// True for a default-constructed Algorithm with no implementation attached.
bool Algorithm::isEmpty() { return !impl_; }

// NOTE(review): name()/collective() dereference impl_ unconditionally —
// undefined behavior on an empty Algorithm; callers should check isEmpty().
std::string Algorithm::name() const { return impl_->name_; }

std::string Algorithm::collective() const { return impl_->collective_; }
// Registers (or overwrites) an algorithm under (collective, algoName).
void AlgorithmCollection::registerAlgorithm(const std::string collective, const std::string algoName,
                                            Algorithm algorithm) {
  this->algoMapByCollective_[collective][algoName] = algorithm;
}

// Picks an algorithm via the user selector, falling back to the fallback
// selector when the user selector is unset or returns an empty Algorithm.
// NOTE(review): fallbackAlgoSelector_ is invoked without a null check — an
// unset fallback raises std::bad_function_call here; confirm it is always set.
Algorithm AlgorithmCollection::selectAlgorithm(const std::string& collective, const void* input, void* output,
                                               size_t messageSize, DataType dtype, int nRanksPerNode, int worldSize) {
  Algorithm algo;
  if (algoSelector_) {
    algo = algoSelector_(algoMapByCollective_, collective, input, output, messageSize, dtype, nRanksPerNode, worldSize);
  }
  if (algo.isEmpty()) {
    algo = fallbackAlgoSelector_(algoMapByCollective_, collective, input, output, messageSize, dtype, nRanksPerNode,
                                 worldSize);
  }
  return algo;
}
// Process-wide singleton builder.
std::shared_ptr<AlgorithmCollectionBuilder> AlgorithmCollectionBuilder::getInstance() {
  static std::shared_ptr<AlgorithmCollectionBuilder> instance(new AlgorithmCollectionBuilder());
  return instance;
}

void AlgorithmCollectionBuilder::addAlgorithmBuilder(std::shared_ptr<AlgorithmBuilder> builder) {
  this->algoBuilders_.push_back(builder);
}

void AlgorithmCollectionBuilder::setAlgorithmSelector(AlgoSelectFunc selector) { algoSelector_ = selector; }

void AlgorithmCollectionBuilder::setFallbackAlgorithmSelector(AlgoSelectFunc selector) {
  fallbackAlgoSelector_ = selector;
}

// Builds a collection from all registered builders and installs the
// configured selectors on it.
std::shared_ptr<AlgorithmCollection> AlgorithmCollectionBuilder::build() {
  auto collection = std::make_shared<AlgorithmCollection>();
  for (const auto& builder : algoBuilders_) {
    auto algo = builder->build();
    collection->registerAlgorithm(algo.collective(), algo.name(), algo);
  }
  collection->algoSelector_ = algoSelector_;
  collection->fallbackAlgoSelector_ = fallbackAlgoSelector_;
  return collection;
}
} // namespace mscclpp

64
src/core/CMakeLists.txt Normal file
View File

@@ -0,0 +1,64 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# Collect every C/C++/CUDA source under src/core into the core object library.
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cpp *.cu)

# Handle ROCm specific source property
# (hipcc compiles .cu files as ordinary C++, so retag them as CXX.)
if(MSCCLPP_USE_ROCM)
    file(GLOB_RECURSE CU_SOURCES *.cu)
    set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX)
endif()

# Single object library compiled once and reused by both the shared and the
# static library targets below.
add_library(mscclpp_obj OBJECT ${SOURCES})
target_include_directories(mscclpp_obj
    SYSTEM PRIVATE
    ${GPU_INCLUDE_DIRS}
    ${NUMA_INCLUDE_DIRS})
target_include_directories(mscclpp_obj PRIVATE
    include
    ${PROJECT_SOURCE_DIR}/include
)
target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads dl)

# Optional InfiniBand support; USE_IBVERBS is PUBLIC so consumers of the
# object library see the same feature macro.
if(MSCCLPP_USE_IB)
    target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
    target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
    target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
endif()

set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
if(MSCCLPP_USE_CUDA)
    target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_CUDA)
elseif(MSCCLPP_USE_ROCM)
    target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_ROCM)
    foreach(arch ${MSCCLPP_GPU_ARCHS})
        target_compile_options(mscclpp_obj PRIVATE --offload-arch=${arch})
    endforeach()
endif()
if(MSCCLPP_ENABLE_TRACE)
    target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_ENABLE_TRACE)
endif()
if(MSCCLPP_NPKIT_FLAGS)
    target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()

# libmscclpp
# Shared and static flavors both wrap the same object library.
add_library(mscclpp SHARED)
target_link_libraries(mscclpp PUBLIC mscclpp_obj)
set_target_properties(mscclpp PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
add_library(mscclpp_static STATIC)
target_link_libraries(mscclpp_static PUBLIC mscclpp_obj)
set_target_properties(mscclpp_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})

install(TARGETS mscclpp
    LIBRARY DESTINATION ${INSTALL_PREFIX}/lib)
install(TARGETS mscclpp_static
    ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib)

201
src/core/algorithm.cc Normal file
View File

@@ -0,0 +1,201 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <filesystem>
#include <mscclpp/algorithm.hpp>
#include "logger.hpp"
namespace mscclpp {
// Classifies the request as in-place or out-of-place.
CollectiveBufferMode CollectiveRequest::bufferMode() const {
  // Directly aliasing buffers are always in-place.
  if (inputBuffer == outputBuffer) {
    return CollectiveBufferMode::InPlace;
  }
  // Allgather also counts as in-place when the input buffer is exactly this
  // rank's slot inside the output buffer.
  if (collective == "allgather") {
    const char* outputBase = static_cast<const char*>(outputBuffer);
    const void* rankSlot = outputBase + rank * messageSize;
    return rankSlot == inputBuffer ? CollectiveBufferMode::InPlace : CollectiveBufferMode::OutOfPlace;
  }
  return CollectiveBufferMode::OutOfPlace;
}
// Binds the user-supplied callbacks and metadata for a natively-implemented
// collective algorithm. Everything is captured by value; no GPU work happens
// until execute() is called.
NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitFunc initFunc, KernelFunc kernelFunc,
                                 ContextInitFunc contextInitFunc, ContextKeyGenFunc contextKeyGenFunc,
                                 size_t minMessageSize, size_t maxMessageSize, CollectiveBufferMode bufferMode,
                                 std::unordered_map<std::string, uint64_t> tags, Constraint constraint)
    : name_(name),
      collective_(collective),
      initFunc_(initFunc),
      kernelLaunchFunc_(kernelFunc),
      contextInitFunc_(contextInitFunc),
      contextKeyGenFunc_(contextKeyGenFunc),
      minMessageSize_(minMessageSize),
      maxMessageSize_(maxMessageSize),
      bufferMode_(bufferMode),
      tags_(tags),
      constraint_(constraint) {}
// Runs the native algorithm: initializes once (lazily), creates and caches a
// context per (input, output, sizes, dtype) key, then launches the kernel.
// The Executor parameter is unused by native algorithms.
// NOTE(review): initialized_ and contexts_ are mutated without locking —
// assumes launches are serialized by the caller; confirm.
CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
                                    size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
                                    cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
                                    const std::unordered_map<std::string, uintptr_t>& extras) {
  if (!initialized_) {
    initFunc_(comm);
    initialized_ = true;
  }
  // Reuse a cached context when one matching this key already exists.
  AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype);
  auto it = contexts_.find(ctxKey);
  if (it == contexts_.end()) {
    auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype);
    contexts_[ctxKey] = ctx;
  }
  return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks,
                           nThreadsPerBlock, extras);
}
const std::string& NativeAlgorithm::name() const { return name_; }

const std::string& NativeAlgorithm::collective() const { return collective_; }

// Returns the [min, max] message-size range this algorithm supports.
// Bug fix: the previous function-local `static` pair was shared by every
// instance across every thread, so concurrent callers (or interleaved calls
// on different instances) could observe another algorithm's range.
// `thread_local` removes the data race; the pair is still refreshed on every
// call, so the returned reference is only valid until the same thread calls
// messageRange() again (on any instance).
const std::pair<size_t, size_t>& NativeAlgorithm::messageRange() const {
  static thread_local std::pair<size_t, size_t> range;
  range = {minMessageSize_, maxMessageSize_};
  return range;
}
const std::unordered_map<std::string, uint64_t>& NativeAlgorithm::tags() const { return tags_; }

const CollectiveBufferMode& NativeAlgorithm::bufferMode() const { return bufferMode_; }

Algorithm::Constraint NativeAlgorithm::constraint() const { return constraint_; }

// Drops all cached contexts and forces re-initialization on the next execute().
void NativeAlgorithm::reset() {
  contexts_.clear();
  initialized_ = false;
}
void AlgorithmCollection::registerAlgorithm(const std::string collective, const std::string algoName,
std::shared_ptr<Algorithm> algorithm) {
this->algoMapByCollective_[collective][algoName] = algorithm;
}
// Picks an algorithm for the request: the primary selector is consulted
// first, then the fallback selector if the primary is unset or returned
// nothing. Returns nullptr when no selector produced a match.
// Bug fix: the fallback selector was previously invoked without a null check
// whenever the primary yielded no algorithm — with only a primary selector
// installed, that raised std::bad_function_call instead of returning nullptr.
std::shared_ptr<Algorithm> AlgorithmCollection::selectAlgorithm(const CollectiveRequest& request) {
  std::shared_ptr<Algorithm> algo;
  if (!algoSelector_ && !fallbackAlgoSelector_) {
    THROW(ALGO, Error, ErrorCode::InvalidUsage, "No algorithm selector is set in AlgorithmCollection.");
  }
  if (algoSelector_) {
    algo = algoSelector_(algoMapByCollective_, request);
  }
  if (!algo && fallbackAlgoSelector_) {
    algo = fallbackAlgoSelector_(algoMapByCollective_, request);
  }
  return algo;
}
void AlgorithmCollection::extend(const AlgorithmCollection& other) {
for (const auto& [collective, algoMap] : other.algoMapByCollective_) {
for (const auto& [algoName, algorithm] : algoMap) {
this->registerAlgorithm(collective, algoName, algorithm);
}
}
}
void AlgorithmCollection::setSelectors(AlgoSelectFunc algoSelector, AlgoSelectFunc fallbackAlgoSelector) {
algoSelector_ = algoSelector;
fallbackAlgoSelector_ = fallbackAlgoSelector;
}
std::vector<std::shared_ptr<Algorithm>> AlgorithmCollection::getAllAlgorithms() const {
std::vector<std::shared_ptr<Algorithm>> allAlgos;
for (const auto& [collective, algoMap] : algoMapByCollective_) {
for (const auto& [algoName, algorithm] : algoMap) {
allAlgos.push_back(algorithm);
}
}
return allAlgos;
}
std::unordered_map<std::string, std::shared_ptr<Algorithm>> AlgorithmCollection::getAlgorithmsByCollective(
const std::string& collective) const {
auto it = algoMapByCollective_.find(collective);
if (it != algoMapByCollective_.end()) {
return it->second;
} else {
return {};
}
}
// Wraps a DSL execution plan as an Algorithm.
DslAlgorithm::DslAlgorithm(std::string id, ExecutionPlan plan, std::unordered_map<std::string, uint64_t> tags,
                           Constraint constraint)
    : plan_(plan), id_(id), tags_(tags), constraint_(constraint) {}

const std::string& DslAlgorithm::name() const { return plan_.name(); }

const std::string& DslAlgorithm::collective() const { return plan_.collective(); }

// Returns the [min, max] message-size range declared by the execution plan.
// Bug fix: the previous function-local `static` pair was shared across all
// instances and threads, letting one algorithm's range clobber another's and
// racing between threads. `thread_local` removes the data race; the returned
// reference stays valid only until the same thread calls messageRange() again.
const std::pair<size_t, size_t>& DslAlgorithm::messageRange() const {
  static thread_local std::pair<size_t, size_t> range;
  range = {plan_.minMessageSize(), plan_.maxMessageSize()};
  return range;
}
const std::unordered_map<std::string, uint64_t>& DslAlgorithm::tags() const { return tags_; }

// Returns whether the wrapped plan operates in-place or out-of-place.
// Bug fix: the previous function-local `static` was initialized exactly once,
// so every DslAlgorithm instance after the first reported the FIRST
// instance's buffer mode (the former "TODO: need to fix"). Recompute on every
// call instead; thread_local storage also keeps the returned reference free
// of cross-thread races.
const CollectiveBufferMode& DslAlgorithm::bufferMode() const {
  static thread_local CollectiveBufferMode mode;
  mode = plan_.isInPlace() ? CollectiveBufferMode::InPlace : CollectiveBufferMode::OutOfPlace;
  return mode;
}

Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }
// Executes the DSL plan via the provided Executor, dispatching on dtype to
// the matching executor instantiation. ReduceOp, block/thread counts and
// extras are ignored — the plan itself encodes the schedule.
// NOTE(review): INT32 is forwarded to the executor tagged as UINT32 —
// bitwise identical for sum, but NOT for signed min/max reductions; confirm
// the executor's handling before relying on int32 min/max.
CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
                                 size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
                                 std::shared_ptr<Executor> executor, int, int,
                                 const std::unordered_map<std::string, uintptr_t>&) {
  if (!executor) {
    THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
  }
  int rank = comm->bootstrap()->getRank();
  switch (dtype) {
    case DataType::FLOAT16:
      executor->execute(rank, (half*)input, (half*)output, inputSize, outputSize, DataType::FLOAT16, plan_, stream);
      break;
    case DataType::FLOAT32:
      executor->execute(rank, (float*)input, (float*)output, inputSize, outputSize, DataType::FLOAT32, plan_, stream);
      break;
    case DataType::BFLOAT16:
      executor->execute(rank, (__bfloat16*)input, (__bfloat16*)output, inputSize, outputSize, DataType::BFLOAT16, plan_,
                        stream);
      break;
#if defined(__FP8_TYPES_EXIST__)
    // FP8 support is only compiled in when the toolkit provides the types.
    case DataType::FP8_E4M3:
      executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FP8_E4M3, plan_,
                        stream);
      break;
    case DataType::FP8_E5M2:
      executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FP8_E5M2, plan_,
                        stream);
      break;
#endif
    case DataType::INT32:
    case DataType::UINT32:
      executor->execute(rank, (int*)input, (int*)output, inputSize, outputSize, DataType::UINT32, plan_, stream);
      break;
    default:
      // Unsupported dtype: report an argument error instead of throwing.
      WARN(ALGO, "Unsupported data type: ", static_cast<int>(dtype), " in DslAlgorithm");
      return CommResult::CommInvalidArgument;
  }
  return CommResult::CommSuccess;
}
// A DslAlgorithm is its own builder: hand out a shared handle to this object
// (requires the instance to already be owned by a shared_ptr).
std::shared_ptr<Algorithm> DslAlgorithm::build() { return shared_from_this(); }

// TODO: implement this
// Currently a no-op; DSL algorithms do not cache native state here yet.
void DslAlgorithm::reset() {}
} // namespace mscclpp

View File

@@ -67,6 +67,10 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
);
#endif
break;
case DataType::FP8_E4M3:
case DataType::FP8_E5M2:
// FP8 is not supported in CUDA execution kernel.
break;
}
}

View File

@@ -15,20 +15,6 @@
namespace {
static const std::vector<mscclpp::AlgoConfig> defaultAlgoConfigs = {
{"allreduce_2nodes_1K_64K.json", "allreduce", 8, 16, {{"default", 1}}},
{"allreduce_2nodes_128K_2M.json", "allreduce", 8, 16, {{"default", 1}}}};
std::string simpleHash(const std::string& input) {
std::hash<std::string> hasher;
size_t hashValue = hasher(input);
std::ostringstream oss;
oss << std::hex << hashValue;
return oss.str();
}
std::string generateFileId(const std::string& filePath) { return simpleHash(filePath); }
template <typename T, typename Predicate>
std::vector<T> filter(const std::vector<T>& vec, Predicate pred) {
std::vector<T> filtered;
@@ -699,9 +685,9 @@ void ExecutionPlan::Impl::operationsReset() { this->operations.clear(); }
ExecutionPlan::ExecutionPlan(const std::string& planPath, int rank) : impl_(std::make_shared<Impl>(planPath, rank)) {}
std::string ExecutionPlan::name() const { return this->impl_->name; }
const std::string& ExecutionPlan::name() const { return this->impl_->name; }
std::string ExecutionPlan::collective() const { return this->impl_->collective; }
const std::string& ExecutionPlan::collective() const { return this->impl_->collective; }
size_t ExecutionPlan::minMessageSize() const { return this->impl_->minMessageSize; }
@@ -709,149 +695,4 @@ size_t ExecutionPlan::maxMessageSize() const { return this->impl_->maxMessageSiz
bool ExecutionPlan::isInPlace() const { return this->impl_->isInPlace; }
void ExecutionPlanRegistry::Impl::setSelector(ExecutionPlanSelector selector) { selector_ = selector; }
void ExecutionPlanRegistry::Impl::setDefaultSelector(ExecutionPlanSelector selector) { defaultSelector_ = selector; }
std::shared_ptr<ExecutionPlanHandle> ExecutionPlanRegistry::Impl::select(const ExecutionRequest& request) {
std::vector<std::shared_ptr<ExecutionPlanHandle>> plans;
for (auto plan : planMap_[request.collective]) {
if (plan->match(request)) {
plans.push_back(plan);
}
}
if (selector_) {
auto plan = selector_(plans, request);
if (plan) {
return plan;
}
}
if (defaultSelector_) {
auto plan = defaultSelector_(plans, request);
if (plan) {
return plan;
}
}
INFO(MSCCLPP_EXECUTOR, "No suitable execution plan found for collective: %s", request.collective.c_str());
return nullptr;
}
void ExecutionPlanRegistry::Impl::registerPlan(const std::shared_ptr<ExecutionPlanHandle> planHandle) {
if (!planHandle) {
throw Error("Cannot register a null plan", ErrorCode::ExecutorError);
}
planMap_[planHandle->plan->collective()].push_back(planHandle);
idMap_[planHandle->id] = planHandle;
}
void ExecutionPlanRegistry::Impl::loadDefaultPlans(int rank) {
std::string planDir = mscclpp::env()->executionPlanDir;
if (!std::filesystem::exists(planDir)) {
INFO(MSCCLPP_EXECUTOR, "Plan directory does not exist: %s", planDir.c_str());
return;
}
for (const auto& config : defaultAlgoConfigs) {
std::string planPath = planDir + "/" + config.filename;
INFO(MSCCLPP_EXECUTOR, "Loading plan: %s", planPath.c_str());
if (!std::filesystem::exists(planPath)) {
INFO(MSCCLPP_EXECUTOR, "Plan file does not exist: %s", planPath.c_str());
continue;
}
std::string planId = generateFileId(planPath);
if (idMap_.find(planId) != idMap_.end()) {
INFO(MSCCLPP_EXECUTOR, "Plan already registered: %s", planId.c_str());
continue;
}
try {
auto executionPlan = std::make_shared<ExecutionPlan>(planPath, rank);
auto handle =
ExecutionPlanHandle::create(planId, config.worldSize, config.nRanksPerNode, executionPlan, config.tags);
registerPlan(handle);
INFO(MSCCLPP_EXECUTOR, "Successfully loaded plan: %s for collective: %s", planId.c_str(),
config.collective.c_str());
} catch (const std::exception& e) {
WARN("Failed to load plan %s: %s", planPath.c_str(), e.what());
}
}
}
std::shared_ptr<ExecutionPlanRegistry> ExecutionPlanRegistry::getInstance() {
static std::shared_ptr<ExecutionPlanRegistry> instance(new ExecutionPlanRegistry);
return instance;
}
void ExecutionPlanRegistry::registerPlan(const std::shared_ptr<ExecutionPlanHandle> planHandle) {
impl_->registerPlan(planHandle);
}
void ExecutionPlanRegistry::setSelector(ExecutionPlanSelector selector) { impl_->setSelector(selector); }
void ExecutionPlanRegistry::setDefaultSelector(ExecutionPlanSelector selector) { impl_->setDefaultSelector(selector); }
std::shared_ptr<ExecutionPlanHandle> ExecutionPlanRegistry::select(
const std::string& collective, int worldSize, int nRanksPerNode, int rank, const void* sendBuffer, void* recvBuffer,
size_t messageSize, const std::unordered_map<std::string, std::vector<uint64_t>>& hints) {
ExecutionRequest request{worldSize, nRanksPerNode, rank, sendBuffer, recvBuffer, messageSize, collective, hints};
return impl_->select(request);
}
std::vector<std::shared_ptr<ExecutionPlanHandle>> ExecutionPlanRegistry::getPlans(const std::string& collective) {
if (impl_->planMap_.find(collective) != impl_->planMap_.end()) {
return impl_->planMap_[collective];
}
return {};
}
std::shared_ptr<ExecutionPlanHandle> ExecutionPlanRegistry::get(const std::string& id) {
if (impl_->idMap_.find(id) != impl_->idMap_.end()) {
return impl_->idMap_[id];
}
return nullptr;
}
ExecutionPlanRegistry::ExecutionPlanRegistry() : impl_(std::make_unique<Impl>()) {}
ExecutionPlanRegistry::~ExecutionPlanRegistry() = default;
void ExecutionPlanRegistry::clear() {
impl_->planMap_.clear();
impl_->idMap_.clear();
impl_->selector_ = nullptr;
impl_->defaultSelector_ = nullptr;
}
void ExecutionPlanRegistry::loadDefaultPlans(int rank) { impl_->loadDefaultPlans(rank); }
bool ExecutionRequest::isInPlace() const {
if (inputBuffer == outputBuffer) return true;
if (collective == "allgather") {
size_t rankOffset = rank * messageSize;
const char* expectedInput = static_cast<const char*>(outputBuffer) + rankOffset;
return static_cast<const void*>(expectedInput) == inputBuffer;
}
return false;
}
std::shared_ptr<ExecutionPlanHandle> ExecutionPlanHandle::create(
const std::string& id, int worldSize, int nRanksPerNode, std::shared_ptr<ExecutionPlan> plan,
const std::unordered_map<std::string, uint64_t>& tags) {
std::shared_ptr<ExecutionPlanHandle> handle(new ExecutionPlanHandle{id, {worldSize, nRanksPerNode}, plan, tags});
return handle;
}
bool ExecutionPlanHandle::match(const ExecutionRequest& request) {
bool worldSizeMatch = constraint.worldSize == request.worldSize;
bool ranksPerNodeMatch = constraint.nRanksPerNode == request.nRanksPerNode;
bool collectiveMatch = plan->collective() == request.collective;
bool inPlaceMatch = plan->isInPlace() == request.isInPlace();
size_t effectiveSize =
(request.collective == "allgather") ? (request.messageSize * request.worldSize) : request.messageSize;
bool minSizeMatch = effectiveSize >= plan->minMessageSize();
bool maxSizeMatch = effectiveSize <= plan->maxMessageSize();
bool result = worldSizeMatch && ranksPerNodeMatch && collectiveMatch && inPlaceMatch && minSizeMatch && maxSizeMatch;
return result;
}
} // namespace mscclpp

View File

@@ -12,11 +12,11 @@ static inline bool isCudaTeardownError(cudaError_t err) {
return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice;
#else // !defined(MSCCLPP_USE_ROCM)
return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError ||
err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure;
err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure || err == cudaErrorDeviceUninitialized;
#endif // !defined(MSCCLPP_USE_ROCM)
}
static inline bool isCuTeardownError(CUresult r) {
[[maybe_unused]] static inline bool isCuTeardownError(CUresult r) {
return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED || r == CUDA_ERROR_LAUNCH_FAILED;
}

Some files were not shown because too many files have changed in this diff Show More