clean up for PR

This commit is contained in:
empyreus
2026-05-11 21:54:32 +00:00
parent 3b96b5ab6e
commit 97dda7bc1b
7 changed files with 325 additions and 138 deletions

View File

@@ -19,35 +19,50 @@ pr:
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
parameters:
- name: hostEntries
type: string
default: |
10.0.0.10 mscclit-000000
10.0.0.11 mscclit-000001
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: SGlangTest
displayName: SGLANG Test
- job: IntegrationTestA100
displayName: Integration test A100
strategy:
matrix:
sglang:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
pool:
name: msccl-ci
container:
image: $(containerImage)
steps:
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
gpuArch: '80'
- job: IntegrationTestH100
displayName: Integration test H100
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
pool:
name: msccl-ci-h100
container:
image: $(containerImage)
steps:
- template: templates/sglang-test.yml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
gpuArch: '90'

View File

@@ -11,19 +11,8 @@ trigger:
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
# Do not run the multi-node test on pull requests; trigger it manually when needed.
pr: none
parameters:
- name: vmssName
@@ -36,16 +25,16 @@ parameters:
10.0.0.4 mscclpp-h100-multinode-ci000001
jobs:
- job: SGlangTestMultiNode
displayName: SGLANG Test Multi Node
- job: MultiNodesTest
displayName: Multi nodes test
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
pool:
name: mscclpp-multi-node
container:
image: $(containerImage)
image: $[ variables['containerImage'] ]
steps:
- task: Bash@3
@@ -68,7 +57,6 @@ jobs:
inputs:
targetType: 'inline'
script: |
nvidia-smi || echo "nvidia-smi not available on agent"
set -e
VMSS="${{ parameters.vmssName }}"
DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
@@ -90,22 +78,46 @@ jobs:
- template: templates/deploy.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
resourceGroup: mscclpp
gpuArch: '90'
deployArgs: 'multi-node-test true cuda'
containerName: 'sglang-mscclpp-test'
gpuArch: '90'
- template: templates/sglang-multi-test.yml
- template: templates/run-remote-task.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-multinode-ci
resourceGroup: mscclpp
hostEntries: ${{ parameters.hostEntries }}
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
continueOnError: true
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh pytests
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
- template: templates/stop.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
resourceGroup: mscclpp

View File

@@ -0,0 +1,141 @@
# =============================================================================
# Multi-node SGLang integration test pipeline.
#
# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
# High-level flow:
# 1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
# pool. The agent itself has no GPUs.
# 2. SSH/host configuration is generated so the agent can reach the two
# pre-provisioned VMSS GPU nodes.
# 3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
# 4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
# 5. `templates/stop.yml` tears down / stops the VMSS nodes.
#
# Docs / non-code changes are excluded from triggering this pipeline.
# =============================================================================
# CI trigger: pushes to main / release branches, ignoring paths that only
# touch docs, container definitions or editor / GitHub configuration.
trigger:
  branches:
    include:
      - main
      - release/*
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - docker/**
      - docs/**
      - '**/*.md'

# PR trigger: same branch and path filters as above; draft PRs are skipped.
pr:
  branches:
    include:
      - main
      - release/*
  drafts: false
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - docker/**
      - docs/**
      - '**/*.md'

parameters:
  # Azure VMSS providing the GPU test nodes. The two node hostnames used
  # below are this name with the suffixes "000000" and "000001".
  - name: vmssName
    type: string
    default: mscclpp-h100-multinode-ci
  # Lines appended verbatim to /etc/hosts on the agent (one "IP hostname"
  # pair per line). The hostnames embed the default vmssName, so change the
  # two parameters together.
  - name: hostEntries
    type: string
    default: |
      10.0.0.5 mscclpp-h100-multinode-ci000000
      10.0.0.4 mscclpp-h100-multinode-ci000001

jobs:
  - job: SGlangTestMultiNode
    displayName: SGLANG Test Multi Node
    # Single-entry matrix: kept so further variants can be added as extra
    # keys without reshaping the job.
    strategy:
      matrix:
        cuda12:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
    pool:
      name: mscclpp-multi-node
    container:
      image: $(containerImage)
    steps:
      # Make the node hostnames resolvable from the agent container. Each
      # non-empty line of hostEntries is appended to /etc/hosts unless an
      # identical line is already there, so re-runs do not duplicate entries.
      - task: Bash@3
        displayName: Add HostEntry
        inputs:
          targetType: 'inline'
          script: |
            while IFS= read -r entry; do
              if [ -z "$entry" ]; then
                continue
              fi
              if grep -qxF "$entry" /etc/hosts; then
                echo "Entry already exists: $entry"
              else
                echo "Adding to /etc/hosts: $entry"
                echo "$entry" | sudo tee -a /etc/hosts
              fi
            done <<< "${{ parameters.hostEntries }}"
      # Write the three files the later templates read from test/deploy:
      #   config       - SSH client settings (port 22345, key path) per node
      #   hostfile     - "azureuser@<node>" lines
      #   hostfile_mpi - bare node hostnames
      - task: Bash@3
        displayName: Generate deploy files
        inputs:
          targetType: 'inline'
          script: |
            set -e
            VMSS="${{ parameters.vmssName }}"
            DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
            NODE0="${VMSS}000000"
            NODE1="${VMSS}000001"
            # Both nodes get the same SSH stanza; only the Host line differs.
            for NODE in "${NODE0}" "${NODE1}"; do
              printf 'Host %s\nPort 22345\nIdentityFile /root/mscclpp/sshkey\nStrictHostKeyChecking no\n' "${NODE}"
            done > "${DEPLOY_DIR}/config"
            # printf re-applies its format to every remaining argument,
            # producing one line per node.
            printf 'azureuser@%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile"
            printf '%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
      # Deployment onto the VMSS nodes is delegated to the shared template.
      - template: templates/deploy.yml
        parameters:
          subscription: mscclpp-ci-h100
          vmssName: ${{ parameters.vmssName }}
          resourceGroup: mscclpp
          gpuArch: '90'
          deployArgs: 'multi-node-test true cuda'
          containerName: 'sglang-mscclpp-test'
      # SGLang multi-node test steps.
      - template: templates/sglang-multi-test.yml
        parameters:
          subscription: mscclpp-ci-h100
          vmssName: ${{ parameters.vmssName }}
      # Stop the VMSS nodes once the tests are done.
      - template: templates/stop.yml
        parameters:
          subscription: mscclpp-ci-h100
          vmssName: ${{ parameters.vmssName }}
          resourceGroup: mscclpp

View File

@@ -1,3 +1,13 @@
# =============================================================================
# Single-node SGLang integration test pipeline.
#
# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
# pool. All deploy / run / teardown logic is delegated to
# `templates/sglang-test.yml`.
#
# Docs / non-code changes are excluded from triggering this pipeline.
# =============================================================================
trigger:
branches:
include:
@@ -19,28 +29,36 @@ pr:
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
parameters:
# Name of the pre-provisioned Azure VMSS that hosts the GPU test node.
- name: vmssName
type: string
default: mscclpp-h100-ci
jobs:
- job: sglangtest
- job: SGlangTest
displayName: SGLANG Test
# Matrix is kept (despite having a single entry today) to make it easy to
# add more variants (e.g. cuda13, rocm) without restructuring the job.
strategy:
matrix:
sglang:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-sglang-x86_64
pool:
name: msccl-ci
name: msccl-ci-h100
container:
image: $(containerImage)
steps:
# Deploy MSCCL++ to the GPU node and run the SGLang single-node tests.
- template: templates/sglang-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
gpuArch: '80'
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
gpuArch: '90'

View File

@@ -1,85 +1,76 @@
# =============================================================================
# SGLang multi-node test template.
#
# Runs on the pipeline agent and dispatches remote steps to the two VMSS GPU
# nodes (via run-remote-task.yml + the SSH config / hostfile produced by the
# caller pipeline). Steps:
# 1. Build and install MSCCL++ on each node.
# 2. Install a (currently forked) SGLang on each node, replacing any
# pre-baked copy from the base image.
# 3. Run a 2-node sglang.bench_one_batch smoke test with MSCCL++ enabled.
# =============================================================================
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: containerName
type: string
default: 'sglang-mscclpp-test'
- name: resourceGroup
type: string
- name: hostEntries
type: string
steps:
# - task: Bash@3
# displayName: Add HostEntry
# inputs:
# targetType: 'inline'
# script: |
# ENTRY="${{ parameters.hostEntries }}"
# if ! grep -qxF "$ENTRY" /etc/hosts; then
# echo "Adding to /etc/hosts"
# echo "$ENTRY" | sudo tee -a /etc/hosts
# else
# echo "Entry already exists, nothing to do."
# fi
# - template: deploy.yml
# parameters:
# subscription: ${{ parameters.subscription }}
# vmssName: ${{ parameters.vmssName }}
# deployArgs: 'single-node-test true cuda'
# containerName: ${{ parameters.containerName }}
# resourceGroup: ${{ parameters.resourceGroup }}
- template: run-remote-task.yml
parameters:
name: InstallMscclpp
displayName: Install mscclpp
runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
remoteScript: |
rm -rf build
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
cd ..
cd ..
pip install .
pip install -r ./python/requirements_cuda12.txt
# TODO: Switch to the official upstream sglang repo once Caio's PR is merged.
# The fork used below (`caiomcbr/sglang` @ release/v0.5.7) is a personal
# branch and should not remain a long-term CI dependency.
- template: run-remote-task.yml
parameters:
name: InstallSGLang
displayName: Install SGLang
runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
remoteScript: |
# Remove any pre-baked sglang from the container image so all nodes
# use the freshly cloned fork (otherwise rank 0 imports
# /sgl-workspace/sglang while rank 1 imports our fork, causing
# version mismatch and NCCL/CUDA errors).
pip uninstall -y sglang sglang-router 2>/dev/null || true
pip uninstall -y sglang sglang-router || true
rm -rf /sgl-workspace/sglang || true
rm -rf sglang
git clone -b release/v0.5.7 https://github.com/caiomcbr/sglang.git
cd sglang
pip install --upgrade pip
pip install -e "python"
# Sanity check: confirm sglang resolves to our fork on every node.
python -c "import sglang, os; p=os.path.dirname(sglang.__file__); print('sglang from:', p); assert '/sgl-workspace' not in p, 'stock sglang still active'"
# Smoke test: 2-node tensor-parallel benchmark of Qwen3-8B with MSCCL++.
# Port 20003 is the SGLang distributed-init rendezvous port (arbitrary, must
# match across ranks and be free on node 0).
- template: run-remote-task.yml
parameters:
name: RunSGLangMutliBenchOneBatch1
name: RunSGLangMultiBenchOneBatch1
displayName: Run SGLang Multi-Node Bench One Batch - 1
runRemoteArgs: '--container sglang-mscclpp-test --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
remoteScript: |
export FLASHINFER_DISABLE_VERSION_CHECK=1
VMSS="${{ parameters.vmssName }}"
HOSTNAME=$(hostname)
# Explicit 2-node mapping: hostname suffix -> SGLang node rank.
if [ "$HOSTNAME" = "${VMSS}000000" ]; then
NODE_RANK=0
elif [ "$HOSTNAME" = "${VMSS}000001" ]; then
@@ -88,4 +79,4 @@ steps:
echo "Unknown hostname: $HOSTNAME"
exit 1
fi
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 16 --dist-init-addr ${VMSS}000000:20003 --nnodes 2 --node-rank $NODE_RANK --enable-mscclpp
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 16 --dist-init-addr ${VMSS}000000:20003 --nnodes 2 --node-rank $NODE_RANK --enable-mscclpp

View File

@@ -1,11 +1,24 @@
# =============================================================================
# SGLang single-node test template.
#
# Runs on the pipeline agent and dispatches remote steps to a single VMSS GPU
# node (via run-remote-task.yml). Steps:
# 1. Deploy: build the test container and bring the VMSS node online.
# 2. Build and install MSCCL++ on the node.
# 3. Install a (currently forked) SGLang.
# 4. Run sglang.bench_one_batch at several batch sizes (kept as separate
# steps for per-batch visibility in the Azure DevOps UI).
# 5. Run a longer end-to-end validation: bring up an sglang server and
# drive it with sglang.bench_serving.
# 6. Run the MSCCL++ all-reduce micro-benchmark via torchrun.
# 7. Stop / deallocate the VMSS node.
# =============================================================================
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
- name: containerName
@@ -13,83 +26,85 @@ parameters:
default: 'sglang-mscclpp-test'
steps:
# deployArgs positional fields: <test-mode> <use-gpu> <cuda|rocm>
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test true cuda'
containerName: ${{ parameters.containerName }}
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test true cuda'
containerName: ${{ parameters.containerName }}
- template: run-remote-task.yml
parameters:
name: InstallMscclpp
displayName: Install mscclpp
runRemoteArgs: '--container sglang-mscclpp-test'
runRemoteArgs: '--container ${{ parameters.containerName }}'
remoteScript: |
echo "PWD: $(pwd)"
ls -la
rm -rf build
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
cd ..
cd ..
pip install .
pip install -r ./python/requirements_cuda12.txt
# TODO: Switch to the official upstream sglang repo once Caio's PR is merged.
# The fork used below (`caiomcbr/sglang` @ main) is a personal branch and
# should not remain a long-term CI dependency. Until then, consider pinning
# to a release branch or commit SHA for reproducibility.
- template: run-remote-task.yml
parameters:
name: InstallSGLang
displayName: Install SGLang
runRemoteArgs: '--container sglang-mscclpp-test'
runRemoteArgs: '--container ${{ parameters.containerName }}'
remoteScript: |
git clone -b main https://github.com/caiomcbr/sglang.git
cd sglang/python
pip install --upgrade pip
pip install -e .
- template: run-remote-task.yml
parameters:
name: RunSGLangBenchOneBatch1
displayName: Run SGLang Bench One Batch - 1
runRemoteArgs: '--container sglang-mscclpp-test'
runRemoteArgs: '--container ${{ parameters.containerName }}'
remoteScript: |
export FLASHINFER_DISABLE_VERSION_CHECK=1
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
- template: run-remote-task.yml
parameters:
name: RunSGLangBenchOneBatch2
displayName: Run SGLang Bench One Batch - 2
runRemoteArgs: '--container sglang-mscclpp-test'
runRemoteArgs: '--container ${{ parameters.containerName }}'
remoteScript: |
export FLASHINFER_DISABLE_VERSION_CHECK=1
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 2 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 2 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
- template: run-remote-task.yml
parameters:
name: RunSGLangBenchOneBatch32
displayName: Run SGLang Bench One Batch - 32
runRemoteArgs: '--container sglang-mscclpp-test'
runRemoteArgs: '--container ${{ parameters.containerName }}'
remoteScript: |
export FLASHINFER_DISABLE_VERSION_CHECK=1
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 32 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 32 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
- template: run-remote-task.yml
parameters:
name: RunSGLangBenchOneBatch64
displayName: Run SGLang Bench One Batch - 64
runRemoteArgs: '--container sglang-mscclpp-test'
runRemoteArgs: '--container ${{ parameters.containerName }}'
remoteScript: |
export FLASHINFER_DISABLE_VERSION_CHECK=1
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 64 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 64 --input-len 256 --output-len 256 --tp-size 8 --disable-custom-all-reduce --enable-mscclpp
- template: run-remote-task.yml
parameters:
name: RunSGLangValidationTest
displayName: Run SGLang Validation Test
runRemoteArgs: '--container sglang-mscclpp-test'
runRemoteArgs: '--container ${{ parameters.containerName }}'
remoteScript: |
export FLASHINFER_DISABLE_VERSION_CHECK=1
@@ -133,7 +148,6 @@ steps:
if [ $ELAPSED -ge $MAX_WAIT ]; then
echo "Server did not become ready within ${MAX_WAIT}s. Logs:"
cat /tmp/sglang_server.log
kill $SERVER_PID 2>/dev/null || true
exit 1
fi
sleep 5
@@ -163,17 +177,17 @@ steps:
echo "Benchmark completed. Results:"
cat "$RESULTS_DIR/run.jsonl" || true
# Shut down the server
kill $SERVER_PID 2>/dev/null || true
wait $SERVER_PID 2>/dev/null || true
# Depends on the `sglang/` source tree cloned by the InstallSGLang step above
# (steps on the same remote share a working directory).
- template: run-remote-task.yml
parameters:
name: RunSGLangTestAllReduce
displayName: Run SGLang Test All Reduce
runRemoteArgs: '--container sglang-mscclpp-test'
runRemoteArgs: '--container ${{ parameters.containerName }}'
remoteScript: |
export FLASHINFER_DISABLE_VERSION_CHECK=1
# Single-node torchrun: WORLD_SIZE here is the number of *nodes* (1),
# not GPUs. nproc_per_node=gpu spawns one rank per local GPU.
export WORLD_SIZE=1
export RANK=0
export MASTER_ADDR=127.0.0.1
@@ -190,4 +204,4 @@ steps:
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -6,11 +6,7 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
# Install cmake (not in base image)
RUN apt-get update && \
apt-get install -y --no-install-recommends \
htop \
lcov \
vim \
&& \
apt-get install -y --no-install-recommends && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*