mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
clean up for PR
This commit is contained in:
@@ -19,35 +19,50 @@ pr:
|
||||
drafts: false
|
||||
paths:
|
||||
exclude:
|
||||
- .devcontainer/**
|
||||
- .github/**
|
||||
- docker/**
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
parameters:
|
||||
- name: hostEntries
|
||||
type: string
|
||||
default: |
|
||||
10.0.0.10 mscclit-000000
|
||||
10.0.0.11 mscclit-000001
|
||||
- .devcontainer/**
|
||||
- .github/**
|
||||
- docker/**
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
jobs:
|
||||
- job: SGlangTest
|
||||
displayName: SGLANG Test
|
||||
- job: IntegrationTestA100
|
||||
displayName: Integration test A100
|
||||
strategy:
|
||||
matrix:
|
||||
sglang:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
|
||||
|
||||
cuda11:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
pool:
|
||||
name: msccl-ci
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/integration-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
gpuArch: '80'
|
||||
|
||||
- job: IntegrationTestH100
|
||||
displayName: Integration test H100
|
||||
strategy:
|
||||
matrix:
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
pool:
|
||||
name: msccl-ci-h100
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/sglang-test.yml
|
||||
- template: templates/integration-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
gpuArch: '90'
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
|
||||
gpuArch: '90'
|
||||
|
||||
@@ -11,19 +11,8 @@ trigger:
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
pr:
|
||||
branches:
|
||||
include:
|
||||
- main
|
||||
- release/*
|
||||
drafts: false
|
||||
paths:
|
||||
exclude:
|
||||
- .devcontainer/**
|
||||
- .github/**
|
||||
- docker/**
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
# Do not run multi-nodes-test for PR, we can trigger it manually
|
||||
pr: none
|
||||
|
||||
parameters:
|
||||
- name: vmssName
|
||||
@@ -36,16 +25,16 @@ parameters:
|
||||
10.0.0.4 mscclpp-h100-multinode-ci000001
|
||||
|
||||
jobs:
|
||||
- job: SGlangTestMultiNode
|
||||
displayName: SGLANG Test Multi Node
|
||||
- job: MultiNodesTest
|
||||
displayName: Multi nodes test
|
||||
strategy:
|
||||
matrix:
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
pool:
|
||||
name: mscclpp-multi-node
|
||||
container:
|
||||
image: $(containerImage)
|
||||
image: $[ variables['containerImage'] ]
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
@@ -68,7 +57,6 @@ jobs:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
nvidia-smi || echo "nvidia-smi not available on agent"
|
||||
set -e
|
||||
VMSS="${{ parameters.vmssName }}"
|
||||
DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
|
||||
@@ -90,22 +78,46 @@ jobs:
|
||||
|
||||
- template: templates/deploy.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
resourceGroup: mscclpp
|
||||
gpuArch: '90'
|
||||
deployArgs: 'multi-node-test true cuda'
|
||||
containerName: 'sglang-mscclpp-test'
|
||||
gpuArch: '90'
|
||||
|
||||
- template: templates/sglang-multi-test.yml
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-multinode-ci
|
||||
resourceGroup: mscclpp
|
||||
hostEntries: ${{ parameters.hostEntries }}
|
||||
name: RunMscclppTest
|
||||
displayName: Run multi-nodes mscclpp-test
|
||||
continueOnError: true
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
|
||||
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMultiNodeUnitTest
|
||||
displayName: Run multi-nodes unit tests
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
|
||||
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMultiNodePythonTests
|
||||
displayName: Run multi-nodes python tests
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh pytests
|
||||
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMultiNodePythonBenchmark
|
||||
displayName: Run multi-nodes python benchmark
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
|
||||
|
||||
- template: templates/stop.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
resourceGroup: mscclpp
|
||||
|
||||
141
.azure-pipelines/sglang-multi-node-test.yml
Normal file
141
.azure-pipelines/sglang-multi-node-test.yml
Normal file
@@ -0,0 +1,141 @@
|
||||
# =============================================================================
|
||||
# Multi-node SGLang integration test pipeline.
|
||||
#
|
||||
# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
|
||||
# High-level flow:
|
||||
# 1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
|
||||
# pool. The agent itself has no GPUs.
|
||||
# 2. SSH/host configuration is generated so the agent can reach the two
|
||||
# pre-provisioned VMSS GPU nodes.
|
||||
# 3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
|
||||
# 4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
|
||||
# 5. `templates/stop.yml` tears down / stops the VMSS nodes.
|
||||
#
|
||||
# Docs / non-code changes are excluded from triggering this pipeline.
|
||||
# =============================================================================
|
||||
|
||||
trigger:
|
||||
branches:
|
||||
include:
|
||||
- main
|
||||
- release/*
|
||||
paths:
|
||||
exclude:
|
||||
- .devcontainer/**
|
||||
- .github/**
|
||||
- docker/**
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
pr:
|
||||
branches:
|
||||
include:
|
||||
- main
|
||||
- release/*
|
||||
drafts: false
|
||||
paths:
|
||||
exclude:
|
||||
- .devcontainer/**
|
||||
- .github/**
|
||||
- docker/**
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
parameters:
|
||||
# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes.
|
||||
# Node hostnames are derived as "${vmssName}000000" and "${vmssName}000001".
|
||||
- name: vmssName
|
||||
type: string
|
||||
default: mscclpp-h100-multinode-ci
|
||||
# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs.
|
||||
# These IPs are tied to the specific VMSS above; update both together if the
|
||||
# VMSS is reprovisioned or renamed.
|
||||
- name: hostEntries
|
||||
type: string
|
||||
default: |
|
||||
10.0.0.5 mscclpp-h100-multinode-ci000000
|
||||
10.0.0.4 mscclpp-h100-multinode-ci000001
|
||||
|
||||
jobs:
|
||||
- job: SGlangTestMultiNode
|
||||
displayName: SGLANG Test Multi Node
|
||||
# Matrix is kept (despite having a single entry today) to make it easy to
|
||||
# add more variants (e.g. cuda13, rocm) without restructuring the job.
|
||||
strategy:
|
||||
matrix:
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
|
||||
pool:
|
||||
name: mscclpp-multi-node
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
# Ensure the VMSS node hostnames resolve from the pipeline agent container.
|
||||
# Idempotent: only appends lines that are not already present in /etc/hosts.
|
||||
- task: Bash@3
|
||||
displayName: Add HostEntry
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
while IFS= read -r line; do
|
||||
[ -z "$line" ] && continue
|
||||
if ! grep -qxF "$line" /etc/hosts; then
|
||||
echo "Adding to /etc/hosts: $line"
|
||||
echo "$line" | sudo tee -a /etc/hosts
|
||||
else
|
||||
echo "Entry already exists: $line"
|
||||
fi
|
||||
done <<< "${{ parameters.hostEntries }}"
|
||||
|
||||
# Generate the SSH config and hostfiles consumed by the deploy / test
|
||||
# templates below:
|
||||
# - config : SSH client config (custom port + key) for each node
|
||||
# - hostfile : user@host list used by deploy / test scripts
|
||||
# - hostfile_mpi : bare hostnames used by mpirun
|
||||
- task: Bash@3
|
||||
displayName: Generate deploy files
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
VMSS="${{ parameters.vmssName }}"
|
||||
DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
|
||||
NODE0="${VMSS}000000"
|
||||
NODE1="${VMSS}000001"
|
||||
|
||||
echo "Host ${NODE0}
|
||||
Port 22345
|
||||
IdentityFile /root/mscclpp/sshkey
|
||||
StrictHostKeyChecking no
|
||||
Host ${NODE1}
|
||||
Port 22345
|
||||
IdentityFile /root/mscclpp/sshkey
|
||||
StrictHostKeyChecking no" > "${DEPLOY_DIR}/config"
|
||||
|
||||
printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
|
||||
|
||||
printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
|
||||
|
||||
# Build MSCCL++ and deploy it onto the VMSS GPU nodes.
|
||||
- template: templates/deploy.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
resourceGroup: mscclpp
|
||||
gpuArch: '90'
|
||||
deployArgs: 'multi-node-test true cuda'
|
||||
containerName: 'sglang-mscclpp-test'
|
||||
|
||||
# Run the SGLang multi-node tests across the two GPU nodes.
|
||||
- template: templates/sglang-multi-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
|
||||
# Stop/deallocate the VMSS GPU nodes to release resources.
|
||||
- template: templates/stop.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
resourceGroup: mscclpp
|
||||
@@ -1,3 +1,13 @@
|
||||
# =============================================================================
|
||||
# Single-node SGLang integration test pipeline.
|
||||
#
|
||||
# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
|
||||
# pool. All deploy / run / teardown logic is delegated to
|
||||
# `templates/sglang-test.yml`.
|
||||
#
|
||||
# Docs / non-code changes are excluded from triggering this pipeline.
|
||||
# =============================================================================
|
||||
|
||||
trigger:
|
||||
branches:
|
||||
include:
|
||||
@@ -19,28 +29,36 @@ pr:
|
||||
drafts: false
|
||||
paths:
|
||||
exclude:
|
||||
- .devcontainer/**
|
||||
- .github/**
|
||||
- docker/**
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
- .devcontainer/**
|
||||
- .github/**
|
||||
- docker/**
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
parameters:
|
||||
# Name of the pre-provisioned Azure VMSS that hosts the GPU test node.
|
||||
- name: vmssName
|
||||
type: string
|
||||
default: mscclpp-h100-ci
|
||||
|
||||
jobs:
|
||||
- job: sglangtest
|
||||
- job: SGlangTest
|
||||
displayName: SGLANG Test
|
||||
# Matrix is kept (despite having a single entry today) to make it easy to
|
||||
# add more variants (e.g. cuda13, rocm) without restructuring the job.
|
||||
strategy:
|
||||
matrix:
|
||||
sglang:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
|
||||
|
||||
pool:
|
||||
name: msccl-ci
|
||||
name: msccl-ci-h100
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
# Deploy MSCCL++ to the GPU node and run the SGLang single-node tests.
|
||||
- template: templates/sglang-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
gpuArch: '80'
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: '90'
|
||||
|
||||
@@ -1,85 +1,76 @@
|
||||
# =============================================================================
|
||||
# SGLang multi-node test template.
|
||||
#
|
||||
# Runs on the pipeline agent and dispatches remote steps to the two VMSS GPU
|
||||
# nodes (via run-remote-task.yml + the SSH config / hostfile produced by the
|
||||
# caller pipeline). Steps:
|
||||
# 1. Build and install MSCCL++ on each node.
|
||||
# 2. Install a (currently forked) SGLang on each node, replacing any
|
||||
# pre-baked copy from the base image.
|
||||
# 3. Run a 2-node sglang.bench_one_batch smoke test with MSCCL++ enabled.
|
||||
# =============================================================================
|
||||
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: perfBaselineFile
|
||||
type: string
|
||||
default: 'test/deploy/perf_ndmv4.jsonl'
|
||||
- name: containerName
|
||||
type: string
|
||||
default: 'sglang-mscclpp-test'
|
||||
- name: resourceGroup
|
||||
type: string
|
||||
- name: hostEntries
|
||||
type: string
|
||||
|
||||
steps:
|
||||
# - task: Bash@3
|
||||
# displayName: Add HostEntry
|
||||
# inputs:
|
||||
# targetType: 'inline'
|
||||
# script: |
|
||||
# ENTRY="${{ parameters.hostEntries }}"
|
||||
# if ! grep -qxF "$ENTRY" /etc/hosts; then
|
||||
# echo "Adding to /etc/hosts"
|
||||
# echo "$ENTRY" | sudo tee -a /etc/hosts
|
||||
# else
|
||||
# echo "Entry already exists, nothing to do."
|
||||
# fi
|
||||
|
||||
# - template: deploy.yml
|
||||
# parameters:
|
||||
# subscription: ${{ parameters.subscription }}
|
||||
# vmssName: ${{ parameters.vmssName }}
|
||||
# deployArgs: 'single-node-test true cuda'
|
||||
# containerName: ${{ parameters.containerName }}
|
||||
# resourceGroup: ${{ parameters.resourceGroup }}
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: InstallMscclpp
|
||||
displayName: Install mscclpp
|
||||
runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
|
||||
remoteScript: |
|
||||
rm -rf build
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j
|
||||
cd ..
|
||||
cd ..
|
||||
pip install .
|
||||
pip install -r ./python/requirements_cuda12.txt
|
||||
|
||||
# TODO: Switch to the official upstream sglang repo once Caio's PR is merged.
|
||||
# Tracking: the fork below (`caiomcbr/sglang` @ release/v0.5.7) is a personal
|
||||
# branch and should not remain a long-term CI dependency.
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: InstallSGLang
|
||||
displayName: Install SGLang
|
||||
runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
|
||||
remoteScript: |
|
||||
# Remove any pre-baked sglang from the container image so all nodes
|
||||
# use the freshly cloned fork (otherwise rank 0 imports
|
||||
# /sgl-workspace/sglang while rank 1 imports our fork, causing
|
||||
# version mismatch and NCCL/CUDA errors).
|
||||
pip uninstall -y sglang sglang-router 2>/dev/null || true
|
||||
pip uninstall -y sglang sglang-router || true
|
||||
rm -rf /sgl-workspace/sglang || true
|
||||
rm -rf sglang
|
||||
git clone -b release/v0.5.7 https://github.com/caiomcbr/sglang.git
|
||||
cd sglang
|
||||
pip install --upgrade pip
|
||||
pip install -e "python"
|
||||
# Sanity check: confirm sglang resolves to our fork on every node.
|
||||
python -c "import sglang, os; p=os.path.dirname(sglang.__file__); print('sglang from:', p); assert '/sgl-workspace' not in p, 'stock sglang still active'"
|
||||
|
||||
# Smoke test: 2-node tensor-parallel benchmark of Qwen3-8B with MSCCL++.
|
||||
# Port 20003 is the SGLang distributed-init rendezvous port (arbitrary, must
|
||||
# match across ranks and be free on node 0).
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunSGLangMutliBenchOneBatch1
|
||||
name: RunSGLangMultiBenchOneBatch1
|
||||
displayName: Run SGLang Multi-Node Bench One Batch - 1
|
||||
runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
|
||||
remoteScript: |
|
||||
export FLASHINFER_DISABLE_VERSION_CHECK=1
|
||||
VMSS="${{ parameters.vmssName }}"
|
||||
HOSTNAME=$(hostname)
|
||||
# Explicit 2-node mapping: hostname suffix -> SGLang node rank.
|
||||
if [ "$HOSTNAME" = "${VMSS}000000" ]; then
|
||||
NODE_RANK=0
|
||||
elif [ "$HOSTNAME" = "${VMSS}000001" ]; then
|
||||
@@ -88,4 +79,4 @@ steps:
|
||||
echo "Unknown hostname: $HOSTNAME"
|
||||
exit 1
|
||||
fi
|
||||
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 16 --dist-init-addr ${VMSS}000000:20003 --nnodes 2 --node-rank $NODE_RANK --enable-mscclpp
|
||||
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 16 --dist-init-addr ${VMSS}000000:20003 --nnodes 2 --node-rank $NODE_RANK --enable-mscclpp
|
||||
|
||||
@@ -1,11 +1,24 @@
|
||||
# =============================================================================
|
||||
# SGLang single-node test template.
|
||||
#
|
||||
# Runs on the pipeline agent and dispatches remote steps to a single VMSS GPU
|
||||
# node (via run-remote-task.yml). Steps:
|
||||
# 1. Deploy: build the test container and bring the VMSS node online.
|
||||
# 2. Build and install MSCCL++ on the node.
|
||||
# 3. Install a (currently forked) SGLang.
|
||||
# 4. Run sglang.bench_one_batch at several batch sizes (kept as separate
|
||||
# steps for per-batch visibility in the Azure DevOps UI).
|
||||
# 5. Run a longer end-to-end validation: bring up an sglang server and
|
||||
# drive it with sglang.bench_serving.
|
||||
# 6. Run the MSCCL++ all-reduce micro-benchmark via torchrun.
|
||||
# 7. Stop / deallocate the VMSS node.
|
||||
# =============================================================================
|
||||
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: perfBaselineFile
|
||||
type: string
|
||||
default: 'test/deploy/perf_ndmv4.jsonl'
|
||||
- name: gpuArch
|
||||
type: string
|
||||
- name: containerName
|
||||
@@ -13,83 +26,85 @@ parameters:
|
||||
default: 'sglang-mscclpp-test'
|
||||
|
||||
steps:
|
||||
# deployArgs positional fields: <test-mode> <use-gpu> <cuda|rocm>
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'single-node-test true cuda'
|
||||
containerName: ${{ parameters.containerName }}
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'single-node-test true cuda'
|
||||
containerName: ${{ parameters.containerName }}
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: InstallMscclpp
|
||||
displayName: Install mscclpp
|
||||
runRemoteArgs: '--container sglang-mscclpp-test'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }}'
|
||||
remoteScript: |
|
||||
echo "PWD: $(pwd)"
|
||||
ls -la
|
||||
rm -rf build
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j
|
||||
cd ..
|
||||
cd ..
|
||||
pip install .
|
||||
pip install -r ./python/requirements_cuda12.txt
|
||||
|
||||
# TODO: Switch to the official upstream sglang repo once Caio's PR is merged.
|
||||
# Tracking: the fork below (`caiomcbr/sglang` @ main) is a personal branch and
|
||||
# should not remain a long-term CI dependency. Also consider pinning to a
|
||||
# release branch or commit SHA for reproducibility.
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: InstallSGLang
|
||||
displayName: Install SGLang
|
||||
runRemoteArgs: '--container sglang-mscclpp-test'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }}'
|
||||
remoteScript: |
|
||||
git clone -b main https://github.com/caiomcbr/sglang.git
|
||||
cd sglang/python
|
||||
pip install --upgrade pip
|
||||
pip install -e .
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunSGLangBenchOneBatch1
|
||||
displayName: Run SGLang Bench One Batch - 1
|
||||
runRemoteArgs: '--container sglang-mscclpp-test'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }}'
|
||||
remoteScript: |
|
||||
export FLASHINFER_DISABLE_VERSION_CHECK=1
|
||||
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
|
||||
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunSGLangBenchOneBatch2
|
||||
displayName: Run SGLang Bench One Batch - 2
|
||||
runRemoteArgs: '--container sglang-mscclpp-test'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }}'
|
||||
remoteScript: |
|
||||
export FLASHINFER_DISABLE_VERSION_CHECK=1
|
||||
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 2 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
|
||||
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 2 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunSGLangBenchOneBatch32
|
||||
displayName: Run SGLang Bench One Batch - 32
|
||||
runRemoteArgs: '--container sglang-mscclpp-test'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }}'
|
||||
remoteScript: |
|
||||
export FLASHINFER_DISABLE_VERSION_CHECK=1
|
||||
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 32 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
|
||||
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 32 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunSGLangBenchOneBatch64
|
||||
displayName: Run SGLang Bench One Batch - 64
|
||||
runRemoteArgs: '--container sglang-mscclpp-test'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }}'
|
||||
remoteScript: |
|
||||
export FLASHINFER_DISABLE_VERSION_CHECK=1
|
||||
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 64 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
|
||||
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 64 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunSGLangValidationTest
|
||||
displayName: Run SGLang Validation Test
|
||||
runRemoteArgs: '--container sglang-mscclpp-test'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }}'
|
||||
remoteScript: |
|
||||
export FLASHINFER_DISABLE_VERSION_CHECK=1
|
||||
|
||||
@@ -133,7 +148,6 @@ steps:
|
||||
if [ $ELAPSED -ge $MAX_WAIT ]; then
|
||||
echo "Server did not become ready within ${MAX_WAIT}s. Logs:"
|
||||
cat /tmp/sglang_server.log
|
||||
kill $SERVER_PID 2>/dev/null || true
|
||||
exit 1
|
||||
fi
|
||||
sleep 5
|
||||
@@ -163,17 +177,17 @@ steps:
|
||||
echo "Benchmark completed. Results:"
|
||||
cat "$RESULTS_DIR/run.jsonl" || true
|
||||
|
||||
# Shut down the server
|
||||
kill $SERVER_PID 2>/dev/null || true
|
||||
wait $SERVER_PID 2>/dev/null || true
|
||||
|
||||
# Depends on the `sglang/` source tree cloned by the InstallSGLang step above
|
||||
# (steps on the same remote share a working directory).
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunSGLangTestAllReduce
|
||||
displayName: Run SGLang Test All Reduce
|
||||
runRemoteArgs: '--container sglang-mscclpp-test'
|
||||
runRemoteArgs: '--container ${{ parameters.containerName }}'
|
||||
remoteScript: |
|
||||
export FLASHINFER_DISABLE_VERSION_CHECK=1
|
||||
# Single-node torchrun: WORLD_SIZE here is the number of *nodes* (1),
|
||||
# not GPUs. nproc_per_node=gpu spawns one rank per local GPU.
|
||||
export WORLD_SIZE=1
|
||||
export RANK=0
|
||||
export MASTER_ADDR=127.0.0.1
|
||||
@@ -190,4 +204,4 @@ steps:
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
|
||||
@@ -6,11 +6,7 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
|
||||
|
||||
# Install cmake (not in base image)
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
htop \
|
||||
lcov \
|
||||
vim \
|
||||
&& \
|
||||
apt-get install -y --no-install-recommends && \
|
||||
apt-get autoremove -y && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/*
|
||||
|
||||
Reference in New Issue
Block a user