From c8c55eae666759504166be66c2ecaed44f7833c3 Mon Sep 17 00:00:00 2001
From: empyreus <rjsouza1995@gmail.com>
Date: Mon, 11 May 2026 21:59:14 +0000
Subject: [PATCH] Switch integration and multi-node pipelines to SGLang tests

---
 .azure-pipelines/integration-test.yml | 54 +++++++---------
 .azure-pipelines/multi-nodes-test.yml | 93 ++++++++++++++++-----------
 2 files changed, 79 insertions(+), 68 deletions(-)

diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index 45bb1e96..5eed0edc 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -1,3 +1,13 @@
+# =============================================================================
+# Single-node SGLang integration test pipeline.
+#
+# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
+# pool. All deploy / run / teardown logic is delegated to
+# `templates/sglang-test.yml`.
+#
+# Docs / non-code changes are excluded from triggering this pipeline.
+# =============================================================================
+
 trigger:
   branches:
     include:
@@ -25,44 +35,30 @@ pr:
     - docs/**
     - '**/*.md'
 
+parameters:
+# Name of the pre-provisioned Azure VMSS that hosts the GPU test node.
+- name: vmssName
+  type: string
+  default: mscclpp-h100-ci
+
 jobs:
-- job: IntegrationTestA100
-  displayName: Integration test A100
+- job: SGlangTest
+  displayName: SGLANG Test
+  # Matrix is kept (despite having a single entry today) to make it easy to
+  # add more variants (e.g. cuda13, rocm) without restructuring the job.
   strategy:
     matrix:
-      cuda11:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
-      cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
-
-  pool:
-    name: msccl-ci
-  container:
-    image: $(containerImage)
-
-  steps:
-  - template: templates/integration-test.yml
-    parameters:
-      subscription: mscclpp-ci
-      vmssName: mscclpp-ci
-      gpuArch: '80'
-
-- job: IntegrationTestH100
-  displayName: Integration test H100
-  strategy:
-    matrix:
-      cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
-
+      sglang:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
   pool:
     name: msccl-ci-h100
   container:
     image: $(containerImage)
 
   steps:
-  - template: templates/integration-test.yml
+  # Deploy MSCCL++ to the GPU node and run the SGLang single-node tests.
+  - template: templates/sglang-test.yml
     parameters:
       subscription: mscclpp-ci-h100
-      vmssName: mscclpp-h100-ci
-      perfBaselineFile: test/deploy/perf_ndmv5.jsonl
+      vmssName: ${{ parameters.vmssName }}
       gpuArch: '90'
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
index ee2766fd..001471ee 100644
--- a/.azure-pipelines/multi-nodes-test.yml
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -1,3 +1,19 @@
+# =============================================================================
+# Multi-node SGLang integration test pipeline.
+#
+# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
+# High-level flow:
+#   1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
+#      pool. The agent itself has no GPUs.
+#   2. SSH/host configuration is generated so the agent can reach the two
+#      pre-provisioned VMSS GPU nodes.
+#   3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
+#   4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
+#   5. `templates/stop.yml` stops / deallocates the VMSS nodes.
+#
+# Docs / non-code changes are excluded from triggering this pipeline.
+# =============================================================================
+
 trigger:
   branches:
     include:
@@ -11,13 +27,29 @@ trigger:
     - docs/**
     - '**/*.md'
 
-# Do not run multi-nodes-test for PR, we can trigger it manually
-pr: none
+pr:
+  branches:
+    include:
+    - main
+    - release/*
+  drafts: false
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
 
 parameters:
+# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes.
+# Node hostnames are derived as "<vmssName>000000" and "<vmssName>000001".
 - name: vmssName
   type: string
   default: mscclpp-h100-multinode-ci
+# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs.
+# These IPs are tied to the specific VMSS above; update both together if the
+# VMSS is reprovisioned or renamed.
 - name: hostEntries
   type: string
   default: |
@@ -25,18 +57,22 @@ parameters:
     10.0.0.4 mscclpp-h100-multinode-ci000001
 
 jobs:
-- job: MultiNodesTest
-  displayName: Multi nodes test
+- job: SGlangTestMultiNode
+  displayName: SGLANG Test Multi Node
+  # Matrix is kept (despite having a single entry today) to make it easy to
+  # add more variants (e.g. cuda13, rocm) without restructuring the job.
   strategy:
     matrix:
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
   pool:
     name: mscclpp-multi-node
   container:
-    image: $[ variables['containerImage'] ]
+    image: $(containerImage)
 
   steps:
+  # Ensure the VMSS node hostnames resolve from the pipeline agent container.
+  # Idempotent: only appends lines that are not already present in /etc/hosts.
   - task: Bash@3
     displayName: Add HostEntry
     inputs:
@@ -52,6 +88,10 @@ jobs:
           fi
         done <<< "${{ parameters.hostEntries }}"
 
+  # Generate the SSH config and hostfile consumed by the deploy / test
+  # templates below:
+  #   - config   : SSH client config (custom port + key) for each node
+  #   - hostfile : user@host list used by deploy / test scripts (parallel-ssh)
   - task: Bash@3
     displayName: Generate deploy files
     inputs:
@@ -74,48 +114,23 @@ jobs:
 
         printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
 
-        printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
-
+  # Build MSCCL++ and deploy it onto the VMSS GPU nodes.
   - template: templates/deploy.yml
     parameters:
       subscription: mscclpp-ci-h100
       vmssName: ${{ parameters.vmssName }}
       resourceGroup: mscclpp
       gpuArch: '90'
+      deployArgs: 'multi-node-test true cuda'
+      containerName: 'sglang-mscclpp-test'
 
-  - template: templates/run-remote-task.yml
+  # Run the SGLang multi-node tests across the two GPU nodes.
+  - template: templates/sglang-multi-test.yml
     parameters:
-      name: RunMscclppTest
-      displayName: Run multi-nodes mscclpp-test
-      continueOnError: true
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
-      remoteScript: |
-        bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
-
-  - template: templates/run-remote-task.yml
-    parameters:
-      name: RunMultiNodeUnitTest
-      displayName: Run multi-nodes unit tests
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
-      remoteScript: |
-        bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
-
-  - template: templates/run-remote-task.yml
-    parameters:
-      name: RunMultiNodePythonTests
-      displayName: Run multi-nodes python tests
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
-      remoteScript: |
-        bash /root/mscclpp/test/deploy/run_tests.sh pytests
-
-  - template: templates/run-remote-task.yml
-    parameters:
-      name: RunMultiNodePythonBenchmark
-      displayName: Run multi-nodes python benchmark
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
-      remoteScript: |
-        bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
 
+  # Stop/deallocate the VMSS GPU nodes to release resources.
   - template: templates/stop.yml
     parameters:
       subscription: mscclpp-ci-h100