revert to original pipeline files

2026-05-12 01:10:22 +00:00 · 2026-05-11 23:02:57 +00:00
parent c8c55eae66
commit a44613e4c0
2 changed files with 68 additions and 79 deletions
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -1,13 +1,3 @@
-# =============================================================================
-# Single-node SGLang integration test pipeline.
-#
-# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
-# pool. All deploy / run / teardown logic is delegated to
-# `templates/sglang-test.yml`.
-#
-# Docs / non-code changes are excluded from triggering this pipeline.
-# =============================================================================
-
 trigger:
  branches:
    include:
@@ -35,30 +25,44 @@ pr:
    - docs/**
    - '**/*.md'

-parameters:
-# Name of the pre-provisioned Azure VMSS that hosts the GPU test node.
- name: vmssName
-  type: string
-  default: mscclpp-h100-ci
-
 jobs:
- job: SGlangTest
-  displayName: SGLANG Test
-  # Matrix is kept (despite having a single entry today) to make it easy to
-  # add more variants (e.g. cuda13, rocm) without restructuring the job.
+- job: IntegrationTestA100
+  displayName: Integration test A100
  strategy:
    matrix:
-      sglang:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
+      cuda11:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+
+  pool:
+    name: msccl-ci
+  container:
+    image: $(containerImage)
+
+  steps:
+  - template: templates/integration-test.yml
+    parameters:
+      subscription: mscclpp-ci
+      vmssName: mscclpp-ci
+      gpuArch: '80'
+
+- job: IntegrationTestH100
+  displayName: Integration test H100
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+
  pool:
    name: msccl-ci-h100
  container:
    image: $(containerImage)

  steps:
-  # Deploy MSCCL++ to the GPU node and run the SGLang single-node tests.
-  - template: templates/sglang-test.yml
+  - template: templates/integration-test.yml
    parameters:
      subscription: mscclpp-ci-h100
-      vmssName: ${{ parameters.vmssName }}
+      vmssName: mscclpp-h100-ci
+      perfBaselineFile: test/deploy/perf_ndmv5.jsonl
      gpuArch: '90'
--- a/.azure-pipelines/multi-nodes-test.yml
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -1,19 +1,3 @@
-# =============================================================================
-# Multi-node SGLang integration test pipeline.
-#
-# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
-# High-level flow:
-#   1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
-#      pool. The agent itself has no GPUs.
-#   2. SSH/host configuration is generated so the agent can reach the two
-#      pre-provisioned VMSS GPU nodes.
-#   3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
-#   4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
-#   5. `templates/stop.yml` tears down / stops the VMSS nodes.
-#
-# Docs / non-code changes are excluded from triggering this pipeline.
-# =============================================================================
-
 trigger:
  branches:
    include:
@@ -27,29 +11,13 @@ trigger:
    - docs/**
    - '**/*.md'

-pr:
-  branches:
-    include:
-    - main
-    - release/*
-  drafts: false
-  paths:
-    exclude:
-    - .devcontainer/**
-    - .github/**
-    - docker/**
-    - docs/**
-    - '**/*.md'
+# Do not run multi-nodes-test on PRs; it can be triggered manually.
+pr: none

 parameters:
-# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes.
-# Node hostnames are derived as "${vmssName}000000" and "${vmssName}000001".
 - name: vmssName
  type: string
  default: mscclpp-h100-multinode-ci
-# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs.
-# These IPs are tied to the specific VMSS above; update both together if the
-# VMSS is reprovisioned or renamed.
 - name: hostEntries
  type: string
  default: |
@@ -57,22 +25,18 @@ parameters:
    10.0.0.4 mscclpp-h100-multinode-ci000001

 jobs:
- job: SGlangTestMultiNode
-  displayName: SGLANG Test Multi Node
-  # Matrix is kept (despite having a single entry today) to make it easy to
-  # add more variants (e.g. cuda13, rocm) without restructuring the job.
+- job: MultiNodesTest
+  displayName: Multi nodes test
  strategy:
    matrix:
      cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
  pool:
    name: mscclpp-multi-node
  container:
-    image: $(containerImage)
+    image: $[ variables['containerImage'] ]

  steps:
-  # Ensure the VMSS node hostnames resolve from the pipeline agent container.
-  # Idempotent: only appends lines that are not already present in /etc/hosts.
  - task: Bash@3
    displayName: Add HostEntry
    inputs:
@@ -88,10 +52,6 @@ jobs:
          fi
        done <<< "${{ parameters.hostEntries }}"

-  # Generate the SSH config and hostfile consumed by the deploy / test
-  # templates below:
-  #   - config   : SSH client config (custom port + key) for each node
-  #   - hostfile : user@host list used by deploy / test scripts (parallel-ssh)
  - task: Bash@3
    displayName: Generate deploy files
    inputs:
@@ -114,23 +74,48 @@ jobs:

        printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"

-  # Build MSCCL++ and deploy it onto the VMSS GPU nodes.
+        printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
+
  - template: templates/deploy.yml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: ${{ parameters.vmssName }}
      resourceGroup: mscclpp
      gpuArch: '90'
-      deployArgs: 'multi-node-test true cuda'
-      containerName: 'sglang-mscclpp-test'

-  # Run the SGLang multi-node tests across the two GPU nodes.
-  - template: templates/sglang-multi-test.yml
+  - template: templates/run-remote-task.yml
    parameters:
-      subscription: mscclpp-ci-h100
-      vmssName: ${{ parameters.vmssName }}
+      name: RunMscclppTest
+      displayName: Run multi-nodes mscclpp-test
+      continueOnError: true
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
+
+  - template: templates/run-remote-task.yml
+    parameters:
+      name: RunMultiNodeUnitTest
+      displayName: Run multi-nodes unit tests
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
+
+  - template: templates/run-remote-task.yml
+    parameters:
+      name: RunMultiNodePythonTests
+      displayName: Run multi-nodes python tests
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh pytests
+
+  - template: templates/run-remote-task.yml
+    parameters:
+      name: RunMultiNodePythonBenchmark
+      displayName: Run multi-nodes python benchmark
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark

-  # Stop/deallocate the VMSS GPU nodes to release resources.
  - template: templates/stop.yml
    parameters:
      subscription: mscclpp-ci-h100