diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index 5eed0edc..45bb1e96 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -1,13 +1,3 @@ -# ============================================================================= -# Single-node SGLang integration test pipeline. -# -# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100` -# pool. All deploy / run / teardown logic is delegated to -# `templates/sglang-test.yml`. -# -# Docs / non-code changes are excluded from triggering this pipeline. -# ============================================================================= - trigger: branches: include: @@ -35,30 +25,44 @@ pr: - docs/** - '**/*.md' -parameters: -# Name of the pre-provisioned Azure VMSS that hosts the GPU test node. -- name: vmssName - type: string - default: mscclpp-h100-ci - jobs: -- job: SGlangTest - displayName: SGLANG Test - # Matrix is kept (despite having a single entry today) to make it easy to - # add more variants (e.g. cuda13, rocm) without restructuring the job. +- job: IntegrationTestA100 + displayName: Integration test A100 strategy: matrix: - sglang: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64 + cuda11: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + pool: + name: msccl-ci + container: + image: $(containerImage) + + steps: + - template: templates/integration-test.yml + parameters: + subscription: mscclpp-ci + vmssName: mscclpp-ci + gpuArch: '80' + +- job: IntegrationTestH100 + displayName: Integration test H100 + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + pool: name: msccl-ci-h100 container: image: $(containerImage) steps: - # Deploy MSCCL++ to the GPU node and run the SGLang single-node tests. - - template: templates/sglang-test.yml + - template: templates/integration-test.yml parameters: subscription: mscclpp-ci-h100 - vmssName: ${{ parameters.vmssName }} + vmssName: mscclpp-h100-ci + perfBaselineFile: test/deploy/perf_ndmv5.jsonl gpuArch: '90' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 001471ee..ee2766fd 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -1,19 +1,3 @@ -# ============================================================================= -# Multi-node SGLang integration test pipeline. -# -# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes. -# High-level flow: -# 1. The pipeline agent runs inside a container on the `mscclpp-multi-node` -# pool. The agent itself has no GPUs. -# 2. SSH/host configuration is generated so the agent can reach the two -# pre-provisioned VMSS GPU nodes. -# 3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes. -# 4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests. -# 5. `templates/stop.yml` tears down / stops the VMSS nodes. -# -# Docs / non-code changes are excluded from triggering this pipeline. -# ============================================================================= - trigger: branches: include: @@ -27,29 +11,13 @@ trigger: - docs/** - '**/*.md' -pr: - branches: - include: - - main - - release/* - drafts: false - paths: - exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' +# Do not run multi-nodes-test for PR, we can trigger it manually +pr: none parameters: -# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes. -# Node hostnames are derived as "${vmssName}000000" and "${vmssName}000001". - name: vmssName type: string default: mscclpp-h100-multinode-ci -# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs. -# These IPs are tied to the specific VMSS above; update both together if the -# VMSS is reprovisioned or renamed. - name: hostEntries type: string default: | @@ -57,22 +25,18 @@ parameters: 10.0.0.4 mscclpp-h100-multinode-ci000001 jobs: -- job: SGlangTestMultiNode - displayName: SGLANG Test Multi Node - # Matrix is kept (despite having a single entry today) to make it easy to - # add more variants (e.g. cuda13, rocm) without restructuring the job. +- job: MultiNodesTest + displayName: Multi nodes test strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 pool: name: mscclpp-multi-node container: - image: $(containerImage) + image: $[ variables['containerImage'] ] steps: - # Ensure the VMSS node hostnames resolve from the pipeline agent container. - # Idempotent: only appends lines that are not already present in /etc/hosts. - task: Bash@3 displayName: Add HostEntry inputs: @@ -88,10 +52,6 @@ jobs: fi done <<< "${{ parameters.hostEntries }}" - # Generate the SSH config and hostfile consumed by the deploy / test - # templates below: - # - config : SSH client config (custom port + key) for each node - # - hostfile : user@host list used by deploy / test scripts (parallel-ssh) - task: Bash@3 displayName: Generate deploy files inputs: @@ -114,23 +74,48 @@ jobs: printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile" - # Build MSCCL++ and deploy it onto the VMSS GPU nodes. + printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi" + - template: templates/deploy.yml parameters: subscription: mscclpp-ci-h100 vmssName: ${{ parameters.vmssName }} resourceGroup: mscclpp gpuArch: '90' - deployArgs: 'multi-node-test true cuda' - containerName: 'sglang-mscclpp-test' - # Run the SGLang multi-node tests across the two GPU nodes. - - template: templates/sglang-multi-test.yml + - template: templates/run-remote-task.yml parameters: - subscription: mscclpp-ci-h100 - vmssName: ${{ parameters.vmssName }} + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + continueOnError: true + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test + + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodeUnitTest + displayName: Run multi-nodes unit tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mp-ut + + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonTests + displayName: Run multi-nodes python tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh pytests + + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark - # Stop/deallocate the VMSS GPU nodes to release resources. - template: templates/stop.yml parameters: subscription: mscclpp-ci-h100