diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index 0d7226f9..45bb1e96 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -19,35 +19,50 @@ pr:
   drafts: false
   paths:
     exclude:
-      - .devcontainer/**
-      - .github/**
-      - docker/**
-      - docs/**
-      - '**/*.md'
-
-parameters:
-- name: hostEntries
-  type: string
-  default: |
-    10.0.0.10 mscclit-000000
-    10.0.0.11 mscclit-000001
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
 
 jobs:
-- job: SGlangTest
-  displayName: SGLANG Test
+- job: IntegrationTestA100
+  displayName: Integration test A100
   strategy:
     matrix:
-      sglang:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
-        
+      cuda11:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+
+  pool:
+    name: msccl-ci
+  container:
+    image: $(containerImage)
+
+  steps:
+  - template: templates/integration-test.yml
+    parameters:
+      subscription: mscclpp-ci
+      vmssName: mscclpp-ci
+      gpuArch: '80'
+
+- job: IntegrationTestH100
+  displayName: Integration test H100
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+
   pool:
     name: msccl-ci-h100
   container:
     image: $(containerImage)
 
   steps:
-  - template: templates/sglang-test.yml
+  - template: templates/integration-test.yml
     parameters:
-      subscription:     mscclpp-ci-h100
-      vmssName:         mscclpp-h100-ci
-      gpuArch:          '90'
\ No newline at end of file
+      subscription: mscclpp-ci-h100
+      vmssName: mscclpp-h100-ci
+      perfBaselineFile: test/deploy/perf_ndmv5.jsonl
+      gpuArch: '90'
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
index 61eab90b..ee2766fd 100644
--- a/.azure-pipelines/multi-nodes-test.yml
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -11,19 +11,8 @@ trigger:
     - docs/**
     - '**/*.md'
 
-pr:
-  branches:
-    include:
-    - main
-    - release/*
-  drafts: false
-  paths:
-    exclude:
-      - .devcontainer/**
-      - .github/**
-      - docker/**
-      - docs/**
-      - '**/*.md'
+# Do not run multi-nodes-test for PR, we can trigger it manually
+pr: none
 
 parameters:
 - name: vmssName
@@ -36,16 +25,16 @@ parameters:
     10.0.0.4 mscclpp-h100-multinode-ci000001
 
 jobs:
-- job: SGlangTestMultiNode
-  displayName: SGLANG Test Multi Node
+- job: MultiNodesTest
+  displayName: Multi nodes test
   strategy:
     matrix:
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
   pool:
     name: mscclpp-multi-node
   container:
-    image: $(containerImage)
+    image: $[ variables['containerImage'] ]
 
   steps:
   - task: Bash@3
@@ -68,7 +57,6 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        nvidia-smi || echo "nvidia-smi not available on agent"
         set -e
         VMSS="${{ parameters.vmssName }}"
         DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
@@ -90,22 +78,46 @@ jobs:
 
   - template: templates/deploy.yml
     parameters:
-      subscription:  mscclpp-ci-h100
-      vmssName:      ${{ parameters.vmssName }}
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
       resourceGroup: mscclpp
-      gpuArch:       '90'
-      deployArgs:    'multi-node-test true cuda'
-      containerName: 'sglang-mscclpp-test'
+      gpuArch: '90'
 
-  - template: templates/sglang-multi-test.yml
+  - template: templates/run-remote-task.yml
     parameters:
-        subscription:  mscclpp-ci-h100
-        vmssName:      mscclpp-h100-multinode-ci
-        resourceGroup: mscclpp
-        hostEntries:   ${{ parameters.hostEntries }}
+      name: RunMscclppTest
+      displayName: Run multi-nodes mscclpp-test
+      continueOnError: true
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
+
+  - template: templates/run-remote-task.yml
+    parameters:
+      name: RunMultiNodeUnitTest
+      displayName: Run multi-nodes unit tests
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
+
+  - template: templates/run-remote-task.yml
+    parameters:
+      name: RunMultiNodePythonTests
+      displayName: Run multi-nodes python tests
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh pytests
+
+  - template: templates/run-remote-task.yml
+    parameters:
+      name: RunMultiNodePythonBenchmark
+      displayName: Run multi-nodes python benchmark
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
 
   - template: templates/stop.yml
     parameters:
-      subscription:  mscclpp-ci-h100
-      vmssName:      ${{ parameters.vmssName }}
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
       resourceGroup: mscclpp
diff --git a/.azure-pipelines/sglang-multi-node-test.yml b/.azure-pipelines/sglang-multi-node-test.yml
new file mode 100644
index 00000000..937a30ec
--- /dev/null
+++ b/.azure-pipelines/sglang-multi-node-test.yml
@@ -0,0 +1,141 @@
+# =============================================================================
+# Multi-node SGLang integration test pipeline.
+#
+# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
+# High-level flow:
+#   1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
+#      pool. The agent itself has no GPUs.
+#   2. SSH/host configuration is generated so the agent can reach the two
+#      pre-provisioned VMSS GPU nodes.
+#   3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
+#   4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
+#   5. `templates/stop.yml` tears down / stops the VMSS nodes.
+#
+# Docs / non-code changes are excluded from triggering this pipeline.
+# =============================================================================
+
+trigger:
+  branches:
+    include:
+    - main
+    - release/*
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+pr:
+  branches:
+    include:
+    - main
+    - release/*
+  drafts: false
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+parameters:
+# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes.
+# Node hostnames are derived as "${vmssName}000000" and "${vmssName}000001".
+- name: vmssName
+  type: string
+  default: mscclpp-h100-multinode-ci
+# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs.
+# These IPs are tied to the specific VMSS above; update both together if the
+# VMSS is reprovisioned or renamed.
+- name: hostEntries
+  type: string
+  default: |
+    10.0.0.5 mscclpp-h100-multinode-ci000000
+    10.0.0.4 mscclpp-h100-multinode-ci000001
+
+jobs:
+- job: SGlangTestMultiNode
+  displayName: SGLANG Test Multi Node
+  # Matrix is kept (despite having a single entry today) to make it easy to
+  # add more variants (e.g. cuda13, rocm) without restructuring the job.
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
+  pool:
+    name: mscclpp-multi-node
+  container:
+    image: $(containerImage)
+
+  steps:
+  # Ensure the VMSS node hostnames resolve from the pipeline agent container.
+  # Idempotent (skips entries already in /etc/hosts); fails if an append fails.
+  - task: Bash@3
+    displayName: Add HostEntry
+    inputs:
+      targetType: 'inline'
+      script: |
+        while IFS= read -r line; do
+          [ -z "$line" ] && continue
+          if ! grep -qxF "$line" /etc/hosts; then
+            echo "Adding to /etc/hosts: $line"
+            echo "$line" | sudo tee -a /etc/hosts || { echo "Failed to update /etc/hosts" >&2; exit 1; }
+          else
+            echo "Entry already exists: $line"
+          fi
+        done <<< "${{ parameters.hostEntries }}"
+
+  # Generate the SSH config and hostfiles consumed by the deploy / test
+  # templates below:
+  #   - config       : SSH client config (custom port + key) for each node
+  #   - hostfile     : user@host list used by deploy / test scripts
+  #   - hostfile_mpi : bare hostnames used by mpirun
+  - task: Bash@3
+    displayName: Generate deploy files
+    inputs:
+      targetType: 'inline'
+      script: |
+        set -e
+        VMSS="${{ parameters.vmssName }}"
+        DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
+        NODE0="${VMSS}000000"
+        NODE1="${VMSS}000001"
+
+        echo "Host ${NODE0}
+          Port 22345
+          IdentityFile /root/mscclpp/sshkey
+          StrictHostKeyChecking no
+        Host ${NODE1}
+          Port 22345
+          IdentityFile /root/mscclpp/sshkey
+          StrictHostKeyChecking no" > "${DEPLOY_DIR}/config"
+
+        printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
+
+        printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
+
+  # Build MSCCL++ and deploy it onto the VMSS GPU nodes.
+  - template: templates/deploy.yml
+    parameters:
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
+      resourceGroup: mscclpp
+      gpuArch: '90'
+      deployArgs: 'multi-node-test true cuda'
+      containerName: 'sglang-mscclpp-test'
+
+  # Run the SGLang multi-node tests across the two GPU nodes.
+  - template: templates/sglang-multi-test.yml
+    parameters:
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
+
+  # Stop/deallocate the VMSS GPU nodes to release resources.
+  - template: templates/stop.yml
+    parameters:
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
+      resourceGroup: mscclpp
diff --git a/.azure-pipelines/sglang-test.yml b/.azure-pipelines/sglang-test.yml
index fdbf93df..5eed0edc 100644
--- a/.azure-pipelines/sglang-test.yml
+++ b/.azure-pipelines/sglang-test.yml
@@ -1,3 +1,13 @@
+# =============================================================================
+# Single-node SGLang integration test pipeline.
+#
+# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
+# pool. All deploy / run / teardown logic is delegated to
+# `templates/sglang-test.yml`.
+#
+# Docs / non-code changes are excluded from triggering this pipeline.
+# =============================================================================
+
 trigger:
   branches:
     include:
@@ -19,28 +29,36 @@ pr:
   drafts: false
   paths:
     exclude:
-      - .devcontainer/**
-      - .github/**
-      - docker/**
-      - docs/**
-      - '**/*.md'
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+parameters:
+# Name of the pre-provisioned Azure VMSS that hosts the GPU test node.
+- name: vmssName
+  type: string
+  default: mscclpp-h100-ci
 
 jobs:
-- job: sglangtest
+- job: SGlangTest
   displayName: SGLANG Test
+  # Matrix is kept (despite having a single entry today) to make it easy to
+  # add more variants (e.g. cuda13, rocm) without restructuring the job.
   strategy:
     matrix:
       sglang:
         containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
-
   pool:
-    name: msccl-ci
+    name: msccl-ci-h100
   container:
     image: $(containerImage)
 
   steps:
+  # Deploy MSCCL++ to the GPU node and run the SGLang single-node tests.
   - template: templates/sglang-test.yml
     parameters:
-      subscription:     mscclpp-ci
-      vmssName:         mscclpp-ci
-      gpuArch:          '80'
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
+      gpuArch: '90'
diff --git a/.azure-pipelines/templates/sglang-multi-test.yml b/.azure-pipelines/templates/sglang-multi-test.yml
index edfafa50..b092a22e 100644
--- a/.azure-pipelines/templates/sglang-multi-test.yml
+++ b/.azure-pipelines/templates/sglang-multi-test.yml
@@ -1,85 +1,76 @@
+# =============================================================================
+# SGLang multi-node test template.
+#
+# Runs on the pipeline agent and dispatches remote steps to the two VMSS GPU
+# nodes (via run-remote-task.yml + the SSH config / hostfile produced by the
+# caller pipeline). Steps:
+#   1. Build and install MSCCL++ on each node.
+#   2. Install a (currently forked) SGLang on each node, replacing any
+#      pre-baked copy from the base image.
+#   3. Run a 2-node sglang.bench_one_batch smoke test with MSCCL++ enabled.
+# =============================================================================
+
 parameters:
 - name: subscription
   type: string
 - name: vmssName
   type: string
-- name: perfBaselineFile
-  type: string
-  default: 'test/deploy/perf_ndmv4.jsonl'
 - name: containerName
   type: string
   default: 'sglang-mscclpp-test'
-- name: resourceGroup
-  type: string
-- name: hostEntries
-  type: string
 
 steps:
-# - task: Bash@3
-#   displayName: Add HostEntry
-#   inputs:
-#     targetType: 'inline'
-#     script: |
-#       ENTRY="${{ parameters.hostEntries }}"
-#       if ! grep -qxF "$ENTRY" /etc/hosts; then
-#         echo "Adding to /etc/hosts"
-#         echo "$ENTRY" | sudo tee -a /etc/hosts
-#       else
-#         echo "Entry already exists, nothing to do."
-#       fi
-
-# - template: deploy.yml
-#   parameters:
-#     subscription:     ${{ parameters.subscription }}
-#     vmssName:         ${{ parameters.vmssName }}
-#     deployArgs:       'single-node-test true cuda'
-#     containerName:    ${{ parameters.containerName }}
-#     resourceGroup:    ${{ parameters.resourceGroup }}
 
 - template: run-remote-task.yml
   parameters:
     name: InstallMscclpp
     displayName: Install mscclpp
-    runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
+    runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
     remoteScript: |
       rm -rf build
-      mkdir build 
-      cd build 
-      cmake -DCMAKE_BUILD_TYPE=Release .. 
+      mkdir build
+      cd build
+      cmake -DCMAKE_BUILD_TYPE=Release ..
       make -j
-      cd .. 
+      cd ..
       pip install .
       pip install -r ./python/requirements_cuda12.txt
 
+# TODO: Switch to the official upstream sglang repo once Caio's PR is merged.
+# The fork below (`caiomcbr/sglang` @ release/v0.5.7) is a personal branch
+# and should not remain a long-term CI dependency.
 - template: run-remote-task.yml
   parameters:
     name: InstallSGLang
     displayName: Install SGLang
-    runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
+    runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
     remoteScript: |
       # Remove any pre-baked sglang from the container image so all nodes
       # use the freshly cloned fork (otherwise rank 0 imports
       # /sgl-workspace/sglang while rank 1 imports our fork, causing
       # version mismatch and NCCL/CUDA errors).
-      pip uninstall -y sglang sglang-router 2>/dev/null || true
+      pip uninstall -y sglang sglang-router || true
       rm -rf /sgl-workspace/sglang || true
       rm -rf sglang
       git clone -b release/v0.5.7 https://github.com/caiomcbr/sglang.git
       cd sglang
-      pip install --upgrade pip
       pip install -e "python"
       # Sanity check: confirm sglang resolves to our fork on every node.
       python -c "import sglang, os; p=os.path.dirname(sglang.__file__); print('sglang from:', p); assert '/sgl-workspace' not in p, 'stock sglang still active'"
 
+# Smoke test: 2-node tensor-parallel benchmark of Qwen3-8B with MSCCL++.
+# Port 20003 is the SGLang distributed-init rendezvous port (arbitrary, must
+# match across ranks and be free on node 0).
 - template: run-remote-task.yml
   parameters:
-    name: RunSGLangMutliBenchOneBatch1
+    name: RunSGLangMultiBenchOneBatch1
     displayName: Run SGLang Multi-Node Bench One Batch - 1
-    runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
+    runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
     remoteScript: |
       export FLASHINFER_DISABLE_VERSION_CHECK=1
       VMSS="${{ parameters.vmssName }}"
       HOSTNAME=$(hostname)
+      # Explicit 2-node mapping: hostname suffix -> SGLang node rank.
       if [ "$HOSTNAME" = "${VMSS}000000" ]; then
         NODE_RANK=0
       elif [ "$HOSTNAME" = "${VMSS}000001" ]; then
@@ -88,4 +79,4 @@ steps:
         echo "Unknown hostname: $HOSTNAME"
         exit 1
       fi
-      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 16 --dist-init-addr ${VMSS}000000:20003 --nnodes 2 --node-rank $NODE_RANK --enable-mscclpp
\ No newline at end of file
+      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 16 --dist-init-addr ${VMSS}000000:20003 --nnodes 2 --node-rank $NODE_RANK --enable-mscclpp
diff --git a/.azure-pipelines/templates/sglang-test.yml b/.azure-pipelines/templates/sglang-test.yml
index a3d0c299..6abe6d9a 100644
--- a/.azure-pipelines/templates/sglang-test.yml
+++ b/.azure-pipelines/templates/sglang-test.yml
@@ -1,11 +1,24 @@
+# =============================================================================
+# SGLang single-node test template.
+#
+# Runs on the pipeline agent and dispatches remote steps to a single VMSS GPU
+# node (via run-remote-task.yml). Steps:
+#   1. Deploy: build the test container and bring the VMSS node online.
+#   2. Build and install MSCCL++ on the node.
+#   3. Install a (currently forked) SGLang.
+#   4. Run sglang.bench_one_batch at several batch sizes (kept as separate
+#      steps for per-batch visibility in the Azure DevOps UI).
+#   5. Run a longer end-to-end validation: bring up an sglang server and
+#      drive it with sglang.bench_serving.
+#   6. Run the MSCCL++ all-reduce micro-benchmark via torchrun.
+#   7. Stop / deallocate the VMSS node.
+# =============================================================================
+
 parameters:
 - name: subscription
   type: string
 - name: vmssName
   type: string
-- name: perfBaselineFile
-  type: string
-  default: 'test/deploy/perf_ndmv4.jsonl'
 - name: gpuArch
   type: string
 - name: containerName
@@ -13,83 +26,85 @@ parameters:
   default: 'sglang-mscclpp-test'
 
 steps:
+# deployArgs positional fields (NOTE(review): confirm against test/deploy/deploy.sh): <test-name> <ib-environment> <cuda|rocm>
 - template: deploy.yml
   parameters:
-    subscription:     ${{ parameters.subscription }}
-    vmssName:         ${{ parameters.vmssName }}
-    gpuArch:          ${{ parameters.gpuArch }}
-    deployArgs:       'single-node-test true cuda'
-    containerName:    ${{ parameters.containerName }}
+    subscription: ${{ parameters.subscription }}
+    vmssName: ${{ parameters.vmssName }}
+    gpuArch: ${{ parameters.gpuArch }}
+    deployArgs: 'single-node-test true cuda'
+    containerName: ${{ parameters.containerName }}
 
 - template: run-remote-task.yml
   parameters:
     name: InstallMscclpp
     displayName: Install mscclpp
-    runRemoteArgs: '--container sglang-mscclpp-test'
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
     remoteScript: |
-      echo "PWD: $(pwd)"
-      ls -la
       rm -rf build
-      mkdir build 
-      cd build 
-      cmake -DCMAKE_BUILD_TYPE=Release .. 
+      mkdir build
+      cd build
+      cmake -DCMAKE_BUILD_TYPE=Release ..
       make -j
-      cd .. 
+      cd ..
       pip install .
       pip install -r ./python/requirements_cuda12.txt
 
+# TODO: Switch to the official upstream sglang repo once Caio's PR is merged.
+# The fork below (`caiomcbr/sglang` @ main) is a personal branch and should
+# not remain a long-term CI dependency. Until then, consider pinning to a
+# release branch or commit SHA for reproducibility.
 - template: run-remote-task.yml
   parameters:
     name: InstallSGLang
     displayName: Install SGLang
-    runRemoteArgs: '--container sglang-mscclpp-test'
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
     remoteScript: |
       git clone -b main https://github.com/caiomcbr/sglang.git
       cd sglang/python
-      pip install --upgrade pip
       pip install -e .
 
 - template: run-remote-task.yml
   parameters:
     name: RunSGLangBenchOneBatch1
     displayName: Run SGLang Bench One Batch - 1
-    runRemoteArgs: '--container sglang-mscclpp-test'
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
     remoteScript: |
       export FLASHINFER_DISABLE_VERSION_CHECK=1
-      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp 
+      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
 
 - template: run-remote-task.yml
   parameters:
     name: RunSGLangBenchOneBatch2
     displayName: Run SGLang Bench One Batch - 2
-    runRemoteArgs: '--container sglang-mscclpp-test'
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
     remoteScript: |
       export FLASHINFER_DISABLE_VERSION_CHECK=1
-      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 2 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp 
+      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 2 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
 
 - template: run-remote-task.yml
   parameters:
     name: RunSGLangBenchOneBatch32
     displayName: Run SGLang Bench One Batch - 32
-    runRemoteArgs: '--container sglang-mscclpp-test'
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
     remoteScript: |
       export FLASHINFER_DISABLE_VERSION_CHECK=1
-      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 32 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp 
+      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 32 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
 
 - template: run-remote-task.yml
   parameters:
     name: RunSGLangBenchOneBatch64
     displayName: Run SGLang Bench One Batch - 64
-    runRemoteArgs: '--container sglang-mscclpp-test'
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
     remoteScript: |
       export FLASHINFER_DISABLE_VERSION_CHECK=1
-      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 64 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp 
+      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 64 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
 
 - template: run-remote-task.yml
   parameters:
     name: RunSGLangValidationTest
     displayName: Run SGLang Validation Test
-    runRemoteArgs: '--container sglang-mscclpp-test'
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
     remoteScript: |
       export FLASHINFER_DISABLE_VERSION_CHECK=1
 
@@ -163,17 +178,22 @@ steps:
       echo "Benchmark completed. Results:"
       cat "$RESULTS_DIR/run.jsonl" || true
 
-      # Shut down the server
+      # Shut down the background server so it is not left running when the
+      # next step starts.
       kill $SERVER_PID 2>/dev/null || true
       wait $SERVER_PID 2>/dev/null || true
 
+# Depends on the `sglang/` source tree cloned by the InstallSGLang step above
+# (steps on the same remote share a working directory).
 - template: run-remote-task.yml
   parameters:
     name: RunSGLangTestAllReduce
     displayName: Run SGLang Test All Reduce
-    runRemoteArgs: '--container sglang-mscclpp-test'
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
     remoteScript: |
       export FLASHINFER_DISABLE_VERSION_CHECK=1
+      # Single-node torchrun: WORLD_SIZE here is the number of *nodes* (1),
+      # not GPUs. nproc_per_node=gpu spawns one rank per local GPU.
       export WORLD_SIZE=1
       export RANK=0
       export MASTER_ADDR=127.0.0.1
@@ -190,4 +210,4 @@ steps:
 - template: stop.yml
   parameters:
     subscription: ${{ parameters.subscription }}
-    vmssName:     ${{ parameters.vmssName }}
\ No newline at end of file
+    vmssName: ${{ parameters.vmssName }}
diff --git a/docker/sglang.dockerfile b/docker/sglang.dockerfile
index 2b7d81b4..f910f6ca 100644
--- a/docker/sglang.dockerfile
+++ b/docker/sglang.dockerfile
@@ -6,11 +6,6 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
 
 # Install cmake (not in base image)
 RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-        htop \
-        lcov \
-        vim \
-        && \
     apt-get autoremove -y && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*