diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index 0d7226f9..45bb1e96 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -19,35 +19,50 @@ pr: drafts: false paths: exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' - -parameters: -- name: hostEntries - type: string - default: | - 10.0.0.10 mscclit-000000 - 10.0.0.11 mscclit-000001 + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' jobs: -- job: SGlangTest - displayName: SGLANG Test +- job: IntegrationTestA100 + displayName: Integration test A100 strategy: matrix: - sglang: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64 - + cuda11: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + pool: + name: msccl-ci + container: + image: $(containerImage) + + steps: + - template: templates/integration-test.yml + parameters: + subscription: mscclpp-ci + vmssName: mscclpp-ci + gpuArch: '80' + +- job: IntegrationTestH100 + displayName: Integration test H100 + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + pool: name: msccl-ci-h100 container: image: $(containerImage) steps: - - template: templates/sglang-test.yml + - template: templates/integration-test.yml parameters: - subscription: mscclpp-ci-h100 - vmssName: mscclpp-h100-ci - gpuArch: '90' \ No newline at end of file + subscription: mscclpp-ci-h100 + vmssName: mscclpp-h100-ci + perfBaselineFile: test/deploy/perf_ndmv5.jsonl + gpuArch: '90' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 61eab90b..ee2766fd 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -11,19 +11,8 @@ trigger: - docs/** - '**/*.md' -pr: - branches: - include: - - main - - release/* - drafts: false - paths: - exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' +# Do not run multi-nodes-test for PR, we can trigger it manually +pr: none parameters: - name: vmssName @@ -36,16 +25,16 @@ parameters: 10.0.0.4 mscclpp-h100-multinode-ci000001 jobs: -- job: SGlangTestMultiNode - displayName: SGLANG Test Multi Node +- job: MultiNodesTest + displayName: Multi nodes test strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 pool: name: mscclpp-multi-node container: - image: $(containerImage) + image: $[ variables['containerImage'] ] steps: - task: Bash@3 @@ -68,7 +57,6 @@ jobs: inputs: targetType: 'inline' script: | - nvidia-smi || echo "nvidia-smi not available on agent" set -e VMSS="${{ parameters.vmssName }}" DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy" @@ -90,22 +78,46 @@ jobs: - template: templates/deploy.yml parameters: - subscription: mscclpp-ci-h100 - vmssName: ${{ parameters.vmssName }} + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} resourceGroup: mscclpp - gpuArch: '90' - deployArgs: 'multi-node-test true cuda' - containerName: 'sglang-mscclpp-test' + gpuArch: '90' - - template: templates/sglang-multi-test.yml + - template: templates/run-remote-task.yml parameters: - subscription: mscclpp-ci-h100 - vmssName: mscclpp-h100-multinode-ci - resourceGroup: mscclpp - hostEntries: ${{ parameters.hostEntries }} + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + continueOnError: true + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test + + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodeUnitTest + displayName: Run multi-nodes unit tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mp-ut + + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonTests + displayName: Run multi-nodes python tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh pytests + + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark - template: templates/stop.yml parameters: - subscription: mscclpp-ci-h100 - vmssName: ${{ parameters.vmssName }} + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} resourceGroup: mscclpp diff --git a/.azure-pipelines/sglang-multi-node-test.yml b/.azure-pipelines/sglang-multi-node-test.yml new file mode 100644 index 00000000..937a30ec --- /dev/null +++ b/.azure-pipelines/sglang-multi-node-test.yml @@ -0,0 +1,141 @@ +# ============================================================================= +# Multi-node SGLang integration test pipeline. +# +# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes. +# High-level flow: +# 1. The pipeline agent runs inside a container on the `mscclpp-multi-node` +# pool. The agent itself has no GPUs. +# 2. SSH/host configuration is generated so the agent can reach the two +# pre-provisioned VMSS GPU nodes. +# 3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes. +# 4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests. +# 5. `templates/stop.yml` tears down / stops the VMSS nodes. +# +# Docs / non-code changes are excluded from triggering this pipeline. +# ============================================================================= + +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +parameters: +# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes. +# Node hostnames are derived as "${vmssName}000000" and "${vmssName}000001". +- name: vmssName + type: string + default: mscclpp-h100-multinode-ci +# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs. +# These IPs are tied to the specific VMSS above; update both together if the +# VMSS is reprovisioned or renamed. +- name: hostEntries + type: string + default: | + 10.0.0.5 mscclpp-h100-multinode-ci000000 + 10.0.0.4 mscclpp-h100-multinode-ci000001 + +jobs: +- job: SGlangTestMultiNode + displayName: SGLANG Test Multi Node + # Matrix is kept (despite having a single entry today) to make it easy to + # add more variants (e.g. cuda13, rocm) without restructuring the job. + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64 + pool: + name: mscclpp-multi-node + container: + image: $(containerImage) + + steps: + # Ensure the VMSS node hostnames resolve from the pipeline agent container. + # Idempotent: only appends lines that are not already present in /etc/hosts. + - task: Bash@3 + displayName: Add HostEntry + inputs: + targetType: 'inline' + script: | + while IFS= read -r line; do + [ -z "$line" ] && continue + if ! grep -qxF "$line" /etc/hosts; then + echo "Adding to /etc/hosts: $line" + echo "$line" | sudo tee -a /etc/hosts + else + echo "Entry already exists: $line" + fi + done <<< "${{ parameters.hostEntries }}" + + # Generate the SSH config and hostfiles consumed by the deploy / test + # templates below: + # - config : SSH client config (custom port + key) for each node + # - hostfile : user@host list used by deploy / test scripts + # - hostfile_mpi : bare hostnames used by mpirun + - task: Bash@3 + displayName: Generate deploy files + inputs: + targetType: 'inline' + script: | + set -e + VMSS="${{ parameters.vmssName }}" + DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy" + NODE0="${VMSS}000000" + NODE1="${VMSS}000001" + + echo "Host ${NODE0} + Port 22345 + IdentityFile /root/mscclpp/sshkey + StrictHostKeyChecking no + Host ${NODE1} + Port 22345 + IdentityFile /root/mscclpp/sshkey + StrictHostKeyChecking no" > "${DEPLOY_DIR}/config" + + printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile" + + printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi" + + # Build MSCCL++ and deploy it onto the VMSS GPU nodes. + - template: templates/deploy.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + resourceGroup: mscclpp + gpuArch: '90' + deployArgs: 'multi-node-test true cuda' + containerName: 'sglang-mscclpp-test' + + # Run the SGLang multi-node tests across the two GPU nodes. + - template: templates/sglang-multi-test.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + + # Stop/deallocate the VMSS GPU nodes to release resources. + - template: templates/stop.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + resourceGroup: mscclpp diff --git a/.azure-pipelines/sglang-test.yml b/.azure-pipelines/sglang-test.yml index fdbf93df..5eed0edc 100644 --- a/.azure-pipelines/sglang-test.yml +++ b/.azure-pipelines/sglang-test.yml @@ -1,3 +1,13 @@ +# ============================================================================= +# Single-node SGLang integration test pipeline. +# +# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100` +# pool. All deploy / run / teardown logic is delegated to +# `templates/sglang-test.yml`. +# +# Docs / non-code changes are excluded from triggering this pipeline. +# ============================================================================= + trigger: branches: include: @@ -19,28 +29,36 @@ pr: drafts: false paths: exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +parameters: +# Name of the pre-provisioned Azure VMSS that hosts the GPU test node. +- name: vmssName + type: string + default: mscclpp-h100-ci jobs: -- job: sglangtest +- job: SGlangTest displayName: SGLANG Test + # Matrix is kept (despite having a single entry today) to make it easy to + # add more variants (e.g. cuda13, rocm) without restructuring the job. strategy: matrix: sglang: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64 - pool: - name: msccl-ci + name: msccl-ci-h100 container: image: $(containerImage) steps: + # Deploy MSCCL++ to the GPU node and run the SGLang single-node tests. - template: templates/sglang-test.yml parameters: - subscription: mscclpp-ci - vmssName: mscclpp-ci - gpuArch: '80' + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + gpuArch: '90' diff --git a/.azure-pipelines/templates/sglang-multi-test.yml b/.azure-pipelines/templates/sglang-multi-test.yml index edfafa50..b092a22e 100644 --- a/.azure-pipelines/templates/sglang-multi-test.yml +++ b/.azure-pipelines/templates/sglang-multi-test.yml @@ -1,85 +1,76 @@ +# ============================================================================= +# SGLang multi-node test template. +# +# Runs on the pipeline agent and dispatches remote steps to the two VMSS GPU +# nodes (via run-remote-task.yml + the SSH config / hostfile produced by the +# caller pipeline). Steps: +# 1. Build and install MSCCL++ on each node. +# 2. Install a (currently forked) SGLang on each node, replacing any +# pre-baked copy from the base image. +# 3. Run a 2-node sglang.bench_one_batch smoke test with MSCCL++ enabled. +# ============================================================================= + parameters: - name: subscription type: string - name: vmssName type: string -- name: perfBaselineFile - type: string - default: 'test/deploy/perf_ndmv4.jsonl' - name: containerName type: string default: 'sglang-mscclpp-test' -- name: resourceGroup - type: string -- name: hostEntries - type: string steps: -# - task: Bash@3 -# displayName: Add HostEntry -# inputs: -# targetType: 'inline' -# script: | -# ENTRY="${{ parameters.hostEntries }}" -# if ! grep -qxF "$ENTRY" /etc/hosts; then -# echo "Adding to /etc/hosts" -# echo "$ENTRY" | sudo tee -a /etc/hosts -# else -# echo "Entry already exists, nothing to do." -# fi - -# - template: deploy.yml -# parameters: -# subscription: ${{ parameters.subscription }} -# vmssName: ${{ parameters.vmssName }} -# deployArgs: 'single-node-test true cuda' -# containerName: ${{ parameters.containerName }} -# resourceGroup: ${{ parameters.resourceGroup }} - template: run-remote-task.yml parameters: name: InstallMscclpp displayName: Install mscclpp - runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser' + runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser' remoteScript: | rm -rf build - mkdir build - cd build - cmake -DCMAKE_BUILD_TYPE=Release .. + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release .. make -j - cd .. + cd .. pip install . pip install -r ./python/requirements_cuda12.txt +# TODO: Switch to the official upstream sglang repo once Caio's PR is merged. +# Tracking: the fork below (`caiomcbr/sglang` @ release/v0.5.7) is a personal +# branch and should not remain a long-term CI dependency. - template: run-remote-task.yml parameters: name: InstallSGLang displayName: Install SGLang - runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser' + runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser' remoteScript: | # Remove any pre-baked sglang from the container image so all nodes # use the freshly cloned fork (otherwise rank 0 imports # /sgl-workspace/sglang while rank 1 imports our fork, causing # version mismatch and NCCL/CUDA errors). - pip uninstall -y sglang sglang-router 2>/dev/null || true + pip uninstall -y sglang sglang-router || true rm -rf /sgl-workspace/sglang || true rm -rf sglang git clone -b release/v0.5.7 https://github.com/caiomcbr/sglang.git cd sglang - pip install --upgrade pip pip install -e "python" # Sanity check: confirm sglang resolves to our fork on every node. python -c "import sglang, os; p=os.path.dirname(sglang.__file__); print('sglang from:', p); assert '/sgl-workspace' not in p, 'stock sglang still active'" +# Smoke test: 2-node tensor-parallel benchmark of Qwen3-8B with MSCCL++. +# Port 20003 is the SGLang distributed-init rendezvous port (arbitrary, must +# match across ranks and be free on node 0). - template: run-remote-task.yml parameters: - name: RunSGLangMutliBenchOneBatch1 + name: RunSGLangMultiBenchOneBatch1 displayName: Run SGLang Multi-Node Bench One Batch - 1 - runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser' + runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser' remoteScript: | export FLASHINFER_DISABLE_VERSION_CHECK=1 VMSS="${{ parameters.vmssName }}" HOSTNAME=$(hostname) + # Explicit 2-node mapping: hostname suffix -> SGLang node rank. if [ "$HOSTNAME" = "${VMSS}000000" ]; then NODE_RANK=0 elif [ "$HOSTNAME" = "${VMSS}000001" ]; then @@ -88,4 +79,4 @@ steps: echo "Unknown hostname: $HOSTNAME" exit 1 fi - python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 16 --dist-init-addr ${VMSS}000000:20003 --nnodes 2 --node-rank $NODE_RANK --enable-mscclpp \ No newline at end of file + python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 16 --dist-init-addr ${VMSS}000000:20003 --nnodes 2 --node-rank $NODE_RANK --enable-mscclpp diff --git a/.azure-pipelines/templates/sglang-test.yml b/.azure-pipelines/templates/sglang-test.yml index a3d0c299..6abe6d9a 100644 --- a/.azure-pipelines/templates/sglang-test.yml +++ b/.azure-pipelines/templates/sglang-test.yml @@ -1,11 +1,24 @@ +# ============================================================================= +# SGLang single-node test template. +# +# Runs on the pipeline agent and dispatches remote steps to a single VMSS GPU +# node (via run-remote-task.yml). Steps: +# 1. Deploy: build the test container and bring the VMSS node online. +# 2. Build and install MSCCL++ on the node. +# 3. Install a (currently forked) SGLang. +# 4. Run sglang.bench_one_batch at several batch sizes (kept as separate +# steps for per-batch visibility in the Azure DevOps UI). +# 5. Run a longer end-to-end validation: bring up an sglang server and +# drive it with sglang.bench_serving. +# 6. Run the MSCCL++ all-reduce micro-benchmark via torchrun. +# 7. Stop / deallocate the VMSS node. +# ============================================================================= + parameters: - name: subscription type: string - name: vmssName type: string -- name: perfBaselineFile - type: string - default: 'test/deploy/perf_ndmv4.jsonl' - name: gpuArch type: string - name: containerName @@ -13,83 +26,85 @@ parameters: default: 'sglang-mscclpp-test' steps: +# deployArgs positional fields: - template: deploy.yml parameters: - subscription: ${{ parameters.subscription }} - vmssName: ${{ parameters.vmssName }} - gpuArch: ${{ parameters.gpuArch }} - deployArgs: 'single-node-test true cuda' - containerName: ${{ parameters.containerName }} + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test true cuda' + containerName: ${{ parameters.containerName }} - template: run-remote-task.yml parameters: name: InstallMscclpp displayName: Install mscclpp - runRemoteArgs: '--container sglang-mscclpp-test' + runRemoteArgs: '--container ${{ parameters.containerName }}' remoteScript: | - echo "PWD: $(pwd)" - ls -la rm -rf build - mkdir build - cd build - cmake -DCMAKE_BUILD_TYPE=Release .. + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release .. make -j - cd .. + cd .. pip install . pip install -r ./python/requirements_cuda12.txt +# TODO: Switch to the official upstream sglang repo once Caio's PR is merged. +# Tracking: the fork below (`caiomcbr/sglang` @ main) is a personal branch and +# should not remain a long-term CI dependency. Also consider pinning to a +# release branch or commit SHA for reproducibility. - template: run-remote-task.yml parameters: name: InstallSGLang displayName: Install SGLang - runRemoteArgs: '--container sglang-mscclpp-test' + runRemoteArgs: '--container ${{ parameters.containerName }}' remoteScript: | git clone -b main https://github.com/caiomcbr/sglang.git cd sglang/python - pip install --upgrade pip pip install -e . - template: run-remote-task.yml parameters: name: RunSGLangBenchOneBatch1 displayName: Run SGLang Bench One Batch - 1 - runRemoteArgs: '--container sglang-mscclpp-test' + runRemoteArgs: '--container ${{ parameters.containerName }}' remoteScript: | export FLASHINFER_DISABLE_VERSION_CHECK=1 - python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp + python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp - template: run-remote-task.yml parameters: name: RunSGLangBenchOneBatch2 displayName: Run SGLang Bench One Batch - 2 - runRemoteArgs: '--container sglang-mscclpp-test' + runRemoteArgs: '--container ${{ parameters.containerName }}' remoteScript: | export FLASHINFER_DISABLE_VERSION_CHECK=1 - python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 2 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp + python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 2 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp - template: run-remote-task.yml parameters: name: RunSGLangBenchOneBatch32 displayName: Run SGLang Bench One Batch - 32 - runRemoteArgs: '--container sglang-mscclpp-test' + runRemoteArgs: '--container ${{ parameters.containerName }}' remoteScript: | export FLASHINFER_DISABLE_VERSION_CHECK=1 - python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 32 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp + python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 32 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp - template: run-remote-task.yml parameters: name: RunSGLangBenchOneBatch64 displayName: Run SGLang Bench One Batch - 64 - runRemoteArgs: '--container sglang-mscclpp-test' + runRemoteArgs: '--container ${{ parameters.containerName }}' remoteScript: | export FLASHINFER_DISABLE_VERSION_CHECK=1 - python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 64 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp + python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 64 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp - template: run-remote-task.yml parameters: name: RunSGLangValidationTest displayName: Run SGLang Validation Test - runRemoteArgs: '--container sglang-mscclpp-test' + runRemoteArgs: '--container ${{ parameters.containerName }}' remoteScript: | export FLASHINFER_DISABLE_VERSION_CHECK=1 @@ -133,7 +148,6 @@ steps: if [ $ELAPSED -ge $MAX_WAIT ]; then echo "Server did not become ready within ${MAX_WAIT}s. Logs:" cat /tmp/sglang_server.log - kill $SERVER_PID 2>/dev/null || true exit 1 fi sleep 5 @@ -163,17 +177,17 @@ steps: echo "Benchmark completed. Results:" cat "$RESULTS_DIR/run.jsonl" || true - # Shut down the server - kill $SERVER_PID 2>/dev/null || true - wait $SERVER_PID 2>/dev/null || true - +# Depends on the `sglang/` source tree cloned by the InstallSGLang step above +# (steps on the same remote share a working directory). - template: run-remote-task.yml parameters: name: RunSGLangTestAllReduce displayName: Run SGLang Test All Reduce - runRemoteArgs: '--container sglang-mscclpp-test' + runRemoteArgs: '--container ${{ parameters.containerName }}' remoteScript: | export FLASHINFER_DISABLE_VERSION_CHECK=1 + # Single-node torchrun: WORLD_SIZE here is the number of *nodes* (1), + # not GPUs. nproc_per_node=gpu spawns one rank per local GPU. export WORLD_SIZE=1 export RANK=0 export MASTER_ADDR=127.0.0.1 @@ -190,4 +204,4 @@ steps: - template: stop.yml parameters: subscription: ${{ parameters.subscription }} - vmssName: ${{ parameters.vmssName }} \ No newline at end of file + vmssName: ${{ parameters.vmssName }} diff --git a/docker/sglang.dockerfile b/docker/sglang.dockerfile index 2b7d81b4..f910f6ca 100644 --- a/docker/sglang.dockerfile +++ b/docker/sglang.dockerfile @@ -6,11 +6,7 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp # Install cmake (not in base image) RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - htop \ - lcov \ - vim \ - && \ + apt-get install -y --no-install-recommends && \ apt-get autoremove -y && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/*