revert to original pipeline files

This commit is contained in:
empyreus
2026-05-11 23:02:57 +00:00
parent c8c55eae66
commit a44613e4c0
2 changed files with 68 additions and 79 deletions

View File

@@ -1,13 +1,3 @@
# =============================================================================
# Single-node SGLang integration test pipeline.
#
# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
# pool. All deploy / run / teardown logic is delegated to
# `templates/sglang-test.yml`.
#
# Docs / non-code changes are excluded from triggering this pipeline.
# =============================================================================
trigger:
branches:
include:
@@ -35,30 +25,44 @@ pr:
- docs/**
- '**/*.md'
parameters:
# Name of the pre-provisioned Azure VMSS that hosts the GPU test node.
- name: vmssName
type: string
default: mscclpp-h100-ci
jobs:
- job: SGlangTest
displayName: SGLANG Test
# Matrix is kept (despite having a single entry today) to make it easy to
# add more variants (e.g. cuda13, rocm) without restructuring the job.
- job: IntegrationTestA100
displayName: Integration test A100
strategy:
matrix:
sglang:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
pool:
name: msccl-ci
container:
image: $(containerImage)
steps:
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
gpuArch: '80'
- job: IntegrationTestH100
displayName: Integration test H100
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
pool:
name: msccl-ci-h100
container:
image: $(containerImage)
steps:
# Deploy MSCCL++ to the GPU node and run the SGLang single-node tests.
- template: templates/sglang-test.yml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
vmssName: mscclpp-h100-ci
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
gpuArch: '90'

View File

@@ -1,19 +1,3 @@
# =============================================================================
# Multi-node SGLang integration test pipeline.
#
# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
# High-level flow:
# 1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
# pool. The agent itself has no GPUs.
# 2. SSH/host configuration is generated so the agent can reach the two
# pre-provisioned VMSS GPU nodes.
# 3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
# 4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
# 5. `templates/stop.yml` tears down / stops the VMSS nodes.
#
# Docs / non-code changes are excluded from triggering this pipeline.
# =============================================================================
trigger:
branches:
include:
@@ -27,29 +11,13 @@ trigger:
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
# Do not run the multi-node tests for PRs; they can be triggered manually.
pr: none
parameters:
# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes.
# Node hostnames are derived as "${vmssName}000000" and "${vmssName}000001".
- name: vmssName
type: string
default: mscclpp-h100-multinode-ci
# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs.
# These IPs are tied to the specific VMSS above; update both together if the
# VMSS is reprovisioned or renamed.
- name: hostEntries
type: string
default: |
@@ -57,22 +25,18 @@ parameters:
10.0.0.4 mscclpp-h100-multinode-ci000001
jobs:
- job: SGlangTestMultiNode
displayName: SGLANG Test Multi Node
# Matrix is kept (despite having a single entry today) to make it easy to
# add more variants (e.g. cuda13, rocm) without restructuring the job.
- job: MultiNodesTest
displayName: Multi nodes test
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
pool:
name: mscclpp-multi-node
container:
image: $(containerImage)
image: $[ variables['containerImage'] ]
steps:
# Ensure the VMSS node hostnames resolve from the pipeline agent container.
# Idempotent: only appends lines that are not already present in /etc/hosts.
- task: Bash@3
displayName: Add HostEntry
inputs:
@@ -88,10 +52,6 @@ jobs:
fi
done <<< "${{ parameters.hostEntries }}"
# Generate the SSH config and hostfile consumed by the deploy / test
# templates below:
# - config : SSH client config (custom port + key) for each node
# - hostfile : user@host list used by deploy / test scripts (parallel-ssh)
- task: Bash@3
displayName: Generate deploy files
inputs:
@@ -114,23 +74,48 @@ jobs:
printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
# Build MSCCL++ and deploy it onto the VMSS GPU nodes.
printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
- template: templates/deploy.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
resourceGroup: mscclpp
gpuArch: '90'
deployArgs: 'multi-node-test true cuda'
containerName: 'sglang-mscclpp-test'
# Run the SGLang multi-node tests across the two GPU nodes.
- template: templates/sglang-multi-test.yml
- template: templates/run-remote-task.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
continueOnError: true
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh pytests
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
# Stop/deallocate the VMSS GPU nodes to release resources.
- template: templates/stop.yml
parameters:
subscription: mscclpp-ci-h100