From c8c55eae666759504166be66c2ecaed44f7833c3 Mon Sep 17 00:00:00 2001
From: empyreus
Date: Mon, 11 May 2026 21:59:14 +0000
Subject: [PATCH] final commit for testing

---
Review notes (free text between the three-dash line and the diff; not part
of the commit message):

* Purpose: integration-test.yml drops the A100 and H100 jobs that used
  `templates/integration-test.yml` and adds one job that uses
  `templates/sglang-test.yml`. multi-nodes-test.yml drops the four
  `templates/run-remote-task.yml` steps and adds one
  `templates/sglang-multi-test.yml` step; both switch the container image to
  `base-sglang-x86_64`.
* Fixed in this copy: in the second integration-test.yml hunk the old
  `- template: templates/integration-test.yml` step is emitted as a removal.
  Only then does the hunk match its `-25,44 +35,30` header and the 29
  deletions reported for that file in the diffstat.
* NOTE(review): multi-nodes-test.yml previously had `pr: none`; this patch
  enables PR triggers for it. Confirm that is intended.
* NOTE(review): generation of `hostfile_mpi` is removed. Confirm nothing
  still reads that file.
* NOTE(review): leading whitespace was not recoverable from the copy under
  review. Indentation below is reconstructed from the YAML nesting and the
  hunk line counts; verify context lines (especially those inside the inline
  Bash scripts and the `hostEntries` block scalar) against the target files
  before applying.
* NOTE(review): the subject line does not describe the change; consider
  rewording it before merge.

 .azure-pipelines/integration-test.yml | 54 +++++++---------
 .azure-pipelines/multi-nodes-test.yml | 93 ++++++++++++++++-----------
 2 files changed, 79 insertions(+), 68 deletions(-)

diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index 45bb1e96..5eed0edc 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -1,3 +1,13 @@
+# =============================================================================
+# Single-node SGLang integration test pipeline.
+#
+# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
+# pool. All deploy / run / teardown logic is delegated to
+# `templates/sglang-test.yml`.
+#
+# Docs / non-code changes are excluded from triggering this pipeline.
+# =============================================================================
+
 trigger:
   branches:
     include:
@@ -25,44 +35,30 @@ pr:
     - docs/**
     - '**/*.md'
 
+parameters:
+# Name of the pre-provisioned Azure VMSS that hosts the GPU test node.
+- name: vmssName
+  type: string
+  default: mscclpp-h100-ci
+
 jobs:
-- job: IntegrationTestA100
-  displayName: Integration test A100
+- job: SGlangTest
+  displayName: SGLANG Test
+  # Matrix is kept (despite having a single entry today) to make it easy to
+  # add more variants (e.g. cuda13, rocm) without restructuring the job.
   strategy:
     matrix:
-      cuda11:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
-      cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
-
-  pool:
-    name: msccl-ci
-  container:
-    image: $(containerImage)
-
-  steps:
-  - template: templates/integration-test.yml
-    parameters:
-      subscription: mscclpp-ci
-      vmssName: mscclpp-ci
-      gpuArch: '80'
-
-- job: IntegrationTestH100
-  displayName: Integration test H100
-  strategy:
-    matrix:
-      cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
-
+      sglang:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
   pool:
     name: msccl-ci-h100
   container:
     image: $(containerImage)
 
   steps:
-  - template: templates/integration-test.yml
+  # Deploy MSCCL++ to the GPU node and run the SGLang single-node tests.
+  - template: templates/sglang-test.yml
     parameters:
       subscription: mscclpp-ci-h100
-      vmssName: mscclpp-h100-ci
-      perfBaselineFile: test/deploy/perf_ndmv5.jsonl
+      vmssName: ${{ parameters.vmssName }}
       gpuArch: '90'
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
index ee2766fd..001471ee 100644
--- a/.azure-pipelines/multi-nodes-test.yml
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -1,3 +1,19 @@
+# =============================================================================
+# Multi-node SGLang integration test pipeline.
+#
+# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
+# High-level flow:
+#   1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
+#      pool. The agent itself has no GPUs.
+#   2. SSH/host configuration is generated so the agent can reach the two
+#      pre-provisioned VMSS GPU nodes.
+#   3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
+#   4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
+#   5. `templates/stop.yml` tears down / stops the VMSS nodes.
+#
+# Docs / non-code changes are excluded from triggering this pipeline.
+# =============================================================================
+
 trigger:
   branches:
     include:
@@ -11,13 +27,29 @@ trigger:
     - docs/**
     - '**/*.md'
 
-# Do not run multi-nodes-test for PR, we can trigger it manually
-pr: none
+pr:
+  branches:
+    include:
+    - main
+    - release/*
+  drafts: false
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
 
 parameters:
+# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes.
+# Node hostnames are derived as "${vmssName}000000" and "${vmssName}000001".
 - name: vmssName
   type: string
   default: mscclpp-h100-multinode-ci
+# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs.
+# These IPs are tied to the specific VMSS above; update both together if the
+# VMSS is reprovisioned or renamed.
 - name: hostEntries
   type: string
   default: |
@@ -25,18 +57,22 @@ parameters:
     10.0.0.4 mscclpp-h100-multinode-ci000001
 
 jobs:
-- job: MultiNodesTest
-  displayName: Multi nodes test
+- job: SGlangTestMultiNode
+  displayName: SGLANG Test Multi Node
+  # Matrix is kept (despite having a single entry today) to make it easy to
+  # add more variants (e.g. cuda13, rocm) without restructuring the job.
   strategy:
     matrix:
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
   pool:
     name: mscclpp-multi-node
   container:
-    image: $[ variables['containerImage'] ]
+    image: $(containerImage)
 
   steps:
+  # Ensure the VMSS node hostnames resolve from the pipeline agent container.
+  # Idempotent: only appends lines that are not already present in /etc/hosts.
   - task: Bash@3
     displayName: Add HostEntry
     inputs:
@@ -52,6 +88,10 @@ jobs:
           fi
         done <<< "${{ parameters.hostEntries }}"
 
+  # Generate the SSH config and hostfile consumed by the deploy / test
+  # templates below:
+  #   - config   : SSH client config (custom port + key) for each node
+  #   - hostfile : user@host list used by deploy / test scripts (parallel-ssh)
   - task: Bash@3
     displayName: Generate deploy files
     inputs:
@@ -74,48 +114,23 @@ jobs:
 
         printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
 
-        printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
-
+  # Build MSCCL++ and deploy it onto the VMSS GPU nodes.
   - template: templates/deploy.yml
     parameters:
       subscription: mscclpp-ci-h100
       vmssName: ${{ parameters.vmssName }}
       resourceGroup: mscclpp
       gpuArch: '90'
+      deployArgs: 'multi-node-test true cuda'
+      containerName: 'sglang-mscclpp-test'
 
-  - template: templates/run-remote-task.yml
+  # Run the SGLang multi-node tests across the two GPU nodes.
+  - template: templates/sglang-multi-test.yml
     parameters:
-      name: RunMscclppTest
-      displayName: Run multi-nodes mscclpp-test
-      continueOnError: true
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
-      remoteScript: |
-        bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
-
-  - template: templates/run-remote-task.yml
-    parameters:
-      name: RunMultiNodeUnitTest
-      displayName: Run multi-nodes unit tests
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
-      remoteScript: |
-        bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
-
-  - template: templates/run-remote-task.yml
-    parameters:
-      name: RunMultiNodePythonTests
-      displayName: Run multi-nodes python tests
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
-      remoteScript: |
-        bash /root/mscclpp/test/deploy/run_tests.sh pytests
-
-  - template: templates/run-remote-task.yml
-    parameters:
-      name: RunMultiNodePythonBenchmark
-      displayName: Run multi-nodes python benchmark
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
-      remoteScript: |
-        bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
 
+  # Stop/deallocate the VMSS GPU nodes to release resources.
   - template: templates/stop.yml
     parameters:
       subscription: mscclpp-ci-h100