From 8fb751470bba8f36dd69cfb33e403145a9eda2dd Mon Sep 17 00:00:00 2001
From: empyreus
Date: Tue, 7 Apr 2026 17:15:05 +0000
Subject: [PATCH] add multi node

---
 .azure-pipelines/integration-test.yml         |  16 ++-
 .../templates/sglang-multi-test.yml           | 101 ++++++++++++++++++
 2 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 .azure-pipelines/templates/sglang-multi-test.yml

diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index 7aebe121..84d4e6c8 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -26,7 +26,7 @@ pr:
     - '**/*.md'
 
 jobs:
-- job: sglangtest
+- job: SGlangTest
   displayName: SGLANG Test
   strategy:
     matrix:
@@ -44,3 +44,17 @@ jobs:
       subscription: mscclpp-ci-h100
       vmssName: mscclpp-h100-ci
       gpuArch: '90'
+
+- job: SGlangMultiNodeTest
+  displayName: SGLANG Multi-Node Test
+  strategy:
+    matrix:
+      sglang:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
+
+  pool:
+    name: mscclpp-it
+  container:
+    image: $(containerImage)
+  # TODO(review): this job defines no `steps:` and this patch never references
+  # templates/sglang-multi-test.yml -- add the template call before merging.
diff --git a/.azure-pipelines/templates/sglang-multi-test.yml b/.azure-pipelines/templates/sglang-multi-test.yml
new file mode 100644
index 00000000..d4d4df3a
--- /dev/null
+++ b/.azure-pipelines/templates/sglang-multi-test.yml
@@ -0,0 +1,101 @@
+# Steps template for the SGLang multi-node benchmark: adds host entries to
+# /etc/hosts, runs deploy.yml, installs mscclpp and SGLang in the test
+# container, runs sglang.bench_one_batch across two nodes, then runs stop.yml.
+parameters:
+- name: subscription
+  type: string
+- name: vmssName
+  type: string
+# NOTE(review): perfBaselineFile is not referenced by any step in this template.
+- name: perfBaselineFile
+  type: string
+  default: 'test/deploy/perf_ndmv4.jsonl'
+- name: gpuArch
+  type: string
+- name: containerName
+  type: string
+  default: 'sglang-mscclpp-test'
+# Line(s) appended to /etc/hosts on the agent; empty adds nothing.
+- name: hostEntries
+  type: string
+  default: ''
+
+steps:
+- task: Bash@3
+  displayName: Add HostEntry
+  inputs:
+    targetType: 'inline'
+    # Each non-empty line of hostEntries is checked on its own, so an empty
+    # parameter is a no-op and one existing entry does not hide the others.
+    script: |
+      ENTRIES="${{ parameters.hostEntries }}"
+      while IFS= read -r ENTRY; do
+        [ -n "$ENTRY" ] || continue
+        if ! grep -qxF -- "$ENTRY" /etc/hosts; then
+          echo "Adding to /etc/hosts"
+          echo "$ENTRY" | sudo tee -a /etc/hosts
+        else
+          echo "Entry already exists, nothing to do."
+        fi
+      done <<< "$ENTRIES"
+
+- template: deploy.yml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName: ${{ parameters.vmssName }}
+    gpuArch: ${{ parameters.gpuArch }}
+    # NOTE(review): 'single-node-test' is passed although this template runs a
+    # two-node benchmark -- confirm against the deploy script.
+    deployArgs: 'single-node-test true cuda'
+    containerName: ${{ parameters.containerName }}
+
+- template: run-remote-task.yml
+  parameters:
+    name: InstallMscclpp
+    displayName: Install mscclpp
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
+    remoteScript: |
+      cd mscclpp
+      rm -rf build
+      mkdir build
+      cd build
+      cmake -DCMAKE_BUILD_TYPE=Release ..
+      make -j
+      cd ..
+      pip install .
+      pip install -r ./python/requirements_cuda12.txt
+
+- template: run-remote-task.yml
+  parameters:
+    name: InstallSGLang
+    displayName: Install SGLang
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
+    remoteScript: |
+      git clone -b release/v0.5.7 https://github.com/caiomcbr/sglang.git
+      cd sglang
+      pip install --upgrade pip
+      pip install -e "python"
+
+- template: run-remote-task.yml
+  parameters:
+    name: RunSGLangMultiBenchOneBatch1
+    displayName: Run SGLang Multi-Node Bench One Batch - 1
+    runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
+    # The node rank is derived from the VM hostname; unknown hosts exit 1.
+    remoteScript: |
+      export FLASHINFER_DISABLE_VERSION_CHECK=1
+      HOSTNAME=$(hostname)
+      if [ "$HOSTNAME" = "mscclit-000000" ]; then
+        NODE_RANK=0
+      elif [ "$HOSTNAME" = "mscclit-000001" ]; then
+        NODE_RANK=1
+      else
+        echo "Unknown hostname: $HOSTNAME"
+        exit 1
+      fi
+      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 16 --dist-init-addr mscclit-000000:20003 --nnodes 2 --node-rank $NODE_RANK --enable-mscclpp
+
+- template: stop.yml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName: ${{ parameters.vmssName }}