mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
final commit for testing
This commit is contained in:
@@ -1,3 +1,13 @@
|
||||
# =============================================================================
|
||||
# Single-node SGLang integration test pipeline.
|
||||
#
|
||||
# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
|
||||
# pool. All deploy / run / teardown logic is delegated to
|
||||
# `templates/sglang-test.yml`.
|
||||
#
|
||||
# Docs / non-code changes are excluded from triggering this pipeline.
|
||||
# =============================================================================
|
||||
|
||||
trigger:
|
||||
branches:
|
||||
include:
|
||||
@@ -25,44 +35,30 @@ pr:
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
parameters:
|
||||
# Name of the pre-provisioned Azure VMSS that hosts the GPU test node.
|
||||
- name: vmssName
|
||||
type: string
|
||||
default: mscclpp-h100-ci
|
||||
|
||||
jobs:
|
||||
- job: IntegrationTestA100
|
||||
displayName: Integration test A100
|
||||
- job: SGlangTest
|
||||
displayName: SGLANG Test
|
||||
# Matrix is kept (despite having a single entry today) to make it easy to
|
||||
# add more variants (e.g. cuda13, rocm) without restructuring the job.
|
||||
strategy:
|
||||
matrix:
|
||||
cuda11:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
pool:
|
||||
name: msccl-ci
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/integration-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
gpuArch: '80'
|
||||
|
||||
- job: IntegrationTestH100
|
||||
displayName: Integration test H100
|
||||
strategy:
|
||||
matrix:
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
sglang:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
|
||||
pool:
|
||||
name: msccl-ci-h100
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/integration-test.yml
|
||||
# Deploy MSCCL++ to the GPU node and run the SGLang single-node tests.
|
||||
- template: templates/sglang-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: '90'
|
||||
|
||||
@@ -1,3 +1,19 @@
|
||||
# =============================================================================
|
||||
# Multi-node SGLang integration test pipeline.
|
||||
#
|
||||
# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
|
||||
# High-level flow:
|
||||
# 1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
|
||||
# pool. The agent itself has no GPUs.
|
||||
# 2. SSH/host configuration is generated so the agent can reach the two
|
||||
# pre-provisioned VMSS GPU nodes.
|
||||
# 3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
|
||||
# 4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
|
||||
# 5. `templates/stop.yml` tears down / stops the VMSS nodes.
|
||||
#
|
||||
# Docs / non-code changes are excluded from triggering this pipeline.
|
||||
# =============================================================================
|
||||
|
||||
trigger:
|
||||
branches:
|
||||
include:
|
||||
@@ -11,13 +27,29 @@ trigger:
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
# Do not run multi-nodes-test for PR, we can trigger it manually
|
||||
pr: none
|
||||
pr:
|
||||
branches:
|
||||
include:
|
||||
- main
|
||||
- release/*
|
||||
drafts: false
|
||||
paths:
|
||||
exclude:
|
||||
- .devcontainer/**
|
||||
- .github/**
|
||||
- docker/**
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
parameters:
|
||||
# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes.
|
||||
# Node hostnames are derived as "${vmssName}000000" and "${vmssName}000001".
|
||||
- name: vmssName
|
||||
type: string
|
||||
default: mscclpp-h100-multinode-ci
|
||||
# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs.
|
||||
# These IPs are tied to the specific VMSS above; update both together if the
|
||||
# VMSS is reprovisioned or renamed.
|
||||
- name: hostEntries
|
||||
type: string
|
||||
default: |
|
||||
@@ -25,18 +57,22 @@ parameters:
|
||||
10.0.0.4 mscclpp-h100-multinode-ci000001
|
||||
|
||||
jobs:
|
||||
- job: MultiNodesTest
|
||||
displayName: Multi nodes test
|
||||
- job: SGlangTestMultiNode
|
||||
displayName: SGLANG Test Multi Node
|
||||
# Matrix is kept (despite having a single entry today) to make it easy to
|
||||
# add more variants (e.g. cuda13, rocm) without restructuring the job.
|
||||
strategy:
|
||||
matrix:
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
|
||||
pool:
|
||||
name: mscclpp-multi-node
|
||||
container:
|
||||
image: $[ variables['containerImage'] ]
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
# Ensure the VMSS node hostnames resolve from the pipeline agent container.
|
||||
# Idempotent: only appends lines that are not already present in /etc/hosts.
|
||||
- task: Bash@3
|
||||
displayName: Add HostEntry
|
||||
inputs:
|
||||
@@ -52,6 +88,10 @@ jobs:
|
||||
fi
|
||||
done <<< "${{ parameters.hostEntries }}"
|
||||
|
||||
# Generate the SSH config and hostfile consumed by the deploy / test
|
||||
# templates below:
|
||||
# - config : SSH client config (custom port + key) for each node
|
||||
# - hostfile : user@host list used by deploy / test scripts (parallel-ssh)
|
||||
- task: Bash@3
|
||||
displayName: Generate deploy files
|
||||
inputs:
|
||||
@@ -74,48 +114,23 @@ jobs:
|
||||
|
||||
printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
|
||||
|
||||
printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
|
||||
|
||||
# Build MSCCL++ and deploy it onto the VMSS GPU nodes.
|
||||
- template: templates/deploy.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
resourceGroup: mscclpp
|
||||
gpuArch: '90'
|
||||
deployArgs: 'multi-node-test true cuda'
|
||||
containerName: 'sglang-mscclpp-test'
|
||||
|
||||
- template: templates/run-remote-task.yml
|
||||
# Run the SGLang multi-node tests across the two GPU nodes.
|
||||
- template: templates/sglang-multi-test.yml
|
||||
parameters:
|
||||
name: RunMscclppTest
|
||||
displayName: Run multi-nodes mscclpp-test
|
||||
continueOnError: true
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
|
||||
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMultiNodeUnitTest
|
||||
displayName: Run multi-nodes unit tests
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
|
||||
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMultiNodePythonTests
|
||||
displayName: Run multi-nodes python tests
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh pytests
|
||||
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMultiNodePythonBenchmark
|
||||
displayName: Run multi-nodes python benchmark
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
|
||||
# Stop/deallocate the VMSS GPU nodes to release resources.
|
||||
- template: templates/stop.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
|
||||
Reference in New Issue
Block a user