From bcb392ffdf024401a9d2cdc2503063fd7a6fe823 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 8 Mar 2026 03:33:51 +0000 Subject: [PATCH] updates --- .azure-pipelines/templates/deploy.yaml | 29 +------- .github/workflows/integration-test-backup.yml | 69 ------------------- .github/workflows/ut-backup.yml | 52 -------------- 3 files changed, 3 insertions(+), 147 deletions(-) delete mode 100644 .github/workflows/integration-test-backup.yml delete mode 100644 .github/workflows/ut-backup.yml diff --git a/.azure-pipelines/templates/deploy.yaml b/.azure-pipelines/templates/deploy.yaml index 0bafa09c..2e6ccc51 100644 --- a/.azure-pipelines/templates/deploy.yaml +++ b/.azure-pipelines/templates/deploy.yaml @@ -48,30 +48,7 @@ steps: fi curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -# 1. Check VMSS availability (fast, fail-fast) -- task: AzureCLI@2 - name: CheckVMSS - displayName: Check VMSS Availability - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - set -e - INSTANCES=$(az vmss list-instances --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} -o json) - COUNT=$(echo "$INSTANCES" | jq 'length') - if [ "$COUNT" -eq 0 ]; then - echo "##vso[task.logissue type=error]No VMSS instances found for ${{ parameters.vmssName }}" - exit 1 - fi - FAILED=$(echo "$INSTANCES" | jq '[.[] | select(.provisioningState == "Failed")] | length') - if [ "$FAILED" -gt 0 ]; then - echo "##vso[task.logissue type=error]$FAILED VMSS instance(s) in Failed state" - exit 1 - fi - echo "VMSS ${{ parameters.vmssName }}: $COUNT instance(s) available" - -# 2. Build +# 1. Build - task: Bash@3 name: ${{ parameters.buildName }} displayName: ${{ parameters.buildDisplayName }} @@ -114,7 +91,7 @@ steps: pwd > build/BUILD_PREFIX workingDirectory: '$(System.DefaultWorkingDirectory)' -# 3. Download SSH key + install packages + start VMSS +# 2. Download SSH key + install packages + start VMSS - task: DownloadSecureFile@1 name: SshKeyFile displayName: Download key file @@ -140,7 +117,7 @@ steps: inlineScript: | az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} -# 4. Deploy test environment +# 3. Deploy test environment - task: Bash@3 name: DeployTestEnv displayName: Deploy Test Env diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml deleted file mode 100644 index 900e8aba..00000000 --- a/.github/workflows/integration-test-backup.yml +++ /dev/null @@ -1,69 +0,0 @@ -name: IntegrationTest - -on: workflow_dispatch - -jobs: - IntegrationTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - - container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - - - name: Lock GPU clock frequency - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: Run mscclpp AllGather test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - - - name: Run mscclpp SendRecv test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl - - - name: Run mscclpp AllReduce test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl - - - name: Run mscclpp AllToAll test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - - - name: Check collective primitives performance - run: | - set -e - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml deleted file mode 100644 index 8849c353..00000000 --- a/.github/workflows/ut-backup.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: UnitTest - -on: workflow_dispatch - -jobs: - UnitTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - timeout-minutes: 30 - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - - container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - working-directory: ${{ github.workspace }} - - - name: LockGPUClock - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: UnitTests - run: | - ./build/bin/unit_tests - - - name: MpUnitTests - run: | - set -e - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests - - - name: PyTests - run: | - set -e - mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x