Files
sglang/docker/Dockerfile
Liangsheng Yin 35870d55ac Deepseek V4 (#23882)
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: fzyzcjy <ch271828n@outlook.com>
Co-authored-by: ispobock <ispobaoke@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: yueming-yuan <yym022502@gmail.com>
Co-authored-by: DarkSharpness <2040703891@qq.com>
Co-authored-by: Yuhao Yang <47235274+yhyang201@users.noreply.github.com>
Co-authored-by: yhyang201 <yhyang201@users.noreply.github.com>
Co-authored-by: yhyang201 <yhyang201@gmail.com>
Co-authored-by: Qiaolin Yu <90088090+qiaolin-yu@users.noreply.github.com>
Co-authored-by: Ethan (Yusheng) Su <11704492+yushengsu-thu@users.noreply.github.com>
Co-authored-by: Mingyi <27337995+wisclmy0611@users.noreply.github.com>
Co-authored-by: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Co-authored-by: Yihao Wang <42559837+againstentropy@users.noreply.github.com>
2026-05-07 18:32:21 -07:00

823 lines
33 KiB
Docker

# CUDA toolkit version; declared before FROM so it can select the base image tag.
ARG CUDA_VERSION=13.0.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu24.04 AS base

# Build-time knobs and their defaults. Several later stages re-declare the
# arguments they read.
ARG TARGETARCH
ARG BUILD_TYPE=all
ARG BRANCH_TYPE=remote
ARG GRACE_BLACKWELL=0
ARG HOPPER_SBO=0
ARG GRACE_BLACKWELL_DEEPEP_BRANCH=gb200_blog_part_2
ARG HOPPER_SBO_DEEPEP_COMMIT=9f2fc4b3182a51044ae7ecb6610f7c9c3258c4d6
ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
ARG BUILD_AND_DOWNLOAD_PARALLEL=8
ARG SGL_KERNEL_VERSION=0.4.2.post1
ARG SGL_VERSION
ARG SGL_DEEP_GEMM_VERSION=0.0.1
ARG USE_LATEST_SGLANG=0
ARG GDRCOPY_VERSION=2.5.1
ARG PIP_DEFAULT_INDEX
ARG UBUNTU_MIRROR
ARG GITHUB_ARTIFACTORY=github.com
ARG INSTALL_FLASHINFER_JIT_CACHE=0
ARG FLASHINFER_VERSION=0.6.8.post1
ARG MOONCAKE_VERSION=0.3.10.post2
# Append any additional Mooncake compile options to MOONCAKE_COMPILE_ARG.
ARG MOONCAKE_COMPILE_ARG="-DUSE_HTTP=ON -DUSE_MNNVL=ON -DUSE_CUDA=ON -DWITH_EP=ON"

# Environment persisted into this stage and every stage derived from it.
ENV DEBIAN_FRONTEND=noninteractive \
    CUDA_HOME=/usr/local/cuda \
    GDRCOPY_HOME=/usr/src/gdrdrv-${GDRCOPY_VERSION}/ \
    FLASHINFER_VERSION=${FLASHINFER_VERSION}

# Append the GKE default NVIDIA bin/lib locations to the search paths.
ENV PATH="${PATH}:/usr/local/nvidia/bin" \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
# Replace Ubuntu sources if specified.
# Ubuntu 24.04 keeps its archive entries in the deb822 file
# /etc/apt/sources.list.d/ubuntu.sources, so rewriting only the legacy
# /etc/apt/sources.list leaves UBUNTU_MIRROR without effect. Both files are
# rewritten when they exist; a missing file is skipped rather than failing.
RUN if [ -n "$UBUNTU_MIRROR" ]; then \
        for src in /etc/apt/sources.list /etc/apt/sources.list.d/ubuntu.sources; do \
            if [ -f "$src" ]; then \
                sed -i \
                    -e "s|http://.*archive.ubuntu.com|$UBUNTU_MIRROR|g" \
                    -e "s|http://.*security.ubuntu.com|$UBUNTU_MIRROR|g" \
                    "$src" || exit 1; \
            fi; \
        done; \
    fi
# Python setup (combined with apt update to reduce layers)
# Ubuntu 24.04 ships Python 3.12 in main, so we no longer need the deadsnakes
# PPA. Dropping it avoids transient Launchpad 504s in `add-apt-repository`.
# Uses apt-get (the scripting-stable CLI) instead of apt, and drops the apt
# package lists in this same layer so they are not stored in the image.
RUN --mount=type=cache,target=/var/cache/apt,id=base-apt \
    apt-get update \
    && apt-get install -y --no-install-recommends wget software-properties-common \
    && apt-get install -y --no-install-recommends python3.12-full python3.12-dev \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 \
    && update-alternatives --set python3 /usr/bin/python3.12 \
    && wget -q https://bootstrap.pypa.io/get-pip.py \
    && python3 get-pip.py --break-system-packages \
    && rm get-pip.py \
    # Allow pip to install packages globally (PEP 668 workaround for Ubuntu 24.04)
    && python3 -m pip config set global.break-system-packages true \
    # Fix for apt-add-repository: expose the versioned apt_pkg extension under
    # the plain apt_pkg.so name (-f keeps the step idempotent).
    && cd /usr/lib/python3/dist-packages/ \
    && ln -sf apt_pkg.cpython-312-*-linux-gnu.so apt_pkg.so \
    && rm -rf /var/lib/apt/lists/*
# System packages for the base image, grouped by purpose. The final steps point
# /usr/bin/python at python3.12 and drop the apt lists/cache from the layer.
RUN --mount=type=cache,target=/var/cache/apt,id=base-apt \
    apt-get update && apt-get install -y --no-install-recommends \
        # Core system utilities
        ca-certificates \
        software-properties-common \
        netcat-openbsd \
        kmod \
        unzip \
        openssh-server \
        curl \
        wget \
        lsof \
        locales \
        # Build essentials (needed for framework stage)
        build-essential \
        cmake \
        perl \
        patchelf \
        ccache \
        git-lfs \
        # MPI and NUMA
        libopenmpi-dev \
        libnuma1 \
        libnuma-dev \
        numactl \
        # transformers multimodal VLM
        ffmpeg \
        # InfiniBand/RDMA
        libibverbs-dev \
        libibverbs1 \
        libibumad3 \
        librdmacm1 \
        libnl-3-200 \
        libnl-route-3-200 \
        libnl-route-3-dev \
        libnl-3-dev \
        ibverbs-providers \
        infiniband-diags \
        perftest \
        # Development libraries
        libgoogle-glog-dev \
        libgtest-dev \
        libjsoncpp-dev \
        libunwind-dev \
        libboost-all-dev \
        libssl-dev \
        libgrpc-dev \
        libgrpc++-dev \
        libprotobuf-dev \
        protobuf-compiler \
        protobuf-compiler-grpc \
        pybind11-dev \
        libhiredis-dev \
        libcurl4-openssl-dev \
        libczmq4 \
        libczmq-dev \
        libfabric-dev \
        linux-libc-dev \
        # Package building tools
        devscripts \
        debhelper \
        fakeroot \
        dkms \
        check \
        libsubunit0 \
        libsubunit-dev \
    && ln -sf /usr/bin/python3.12 /usr/bin/python \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
# Point pip's global index-url at PIP_DEFAULT_INDEX when one is supplied.
# The value is quoted so a URL containing shell-special characters is passed
# to pip as a single argument.
RUN if [ -n "${PIP_DEFAULT_INDEX}" ]; then \
        python3 -m pip config set global.index-url "${PIP_DEFAULT_INDEX}"; \
    fi
# GDRCopy installation
# Downloads the tagged source archive, builds the Debian packages and installs
# them. All work happens inside /tmp/gdrcopy so that the final `rm -rf` really
# removes the extracted source and build tree from this layer (previously the
# archive was unpacked next to that directory and left behind).
RUN mkdir -p /tmp/gdrcopy && cd /tmp/gdrcopy \
    && curl --retry 3 --retry-delay 2 -fsSL -o v${GDRCOPY_VERSION}.tar.gz \
        https://${GITHUB_ARTIFACTORY}/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
    && tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
    && cd gdrcopy-${GDRCOPY_VERSION}/packages \
    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
    && cd / && rm -rf /tmp/gdrcopy
# DeepEP IBGDA fix: expose libmlx5.so.1 under the unversioned libmlx5.so name
# in the architecture-specific library directory.
RUN MULTIARCH_LIBDIR="/usr/lib/$(uname -m)-linux-gnu" \
    && ln -sf "${MULTIARCH_LIBDIR}/libmlx5.so.1" "${MULTIARCH_LIBDIR}/libmlx5.so"

# Generate the en_US.UTF-8 locale and make it the default.
RUN locale-gen en_US.UTF-8
ENV LANG=en_US.UTF-8 \
    LANGUAGE=en_US:en \
    LC_ALL=en_US.UTF-8
########################################################
########## PARALLEL BUILDER STAGES ####################
########################################################
#
# These stages run IN PARALLEL via BuildKit:
#
# base
# |
# +-- torch_deps ------> deepep_builder (needs torch)
# | \-> flashinfer_cache (needs flashinfer)
# |
# +-- devtools_builder (independent)
# +-- gateway_builder (independent, only needs gateway source)
# |
# v
# framework (combines all artifacts)
#
########################################################
# PARALLEL STAGE 1: Torch/Deps Builder (starts from base)
########################################################
FROM base AS torch_deps
ARG CUDA_VERSION
ARG BUILD_TYPE
ARG SGL_KERNEL_VERSION
ARG GITHUB_ARTIFACTORY
WORKDIR /sgl-workspace

# Rust toolchain for setuptools-rust extensions (e.g. sglang-grpc).
# Requires >= 1.85 (edition 2024). Inherited by framework via FROM torch_deps.
# rustup is told not to edit shell profiles, so cargo's bin dir is put on PATH here.
ENV PATH="/root/.cargo/bin:${PATH}"
RUN curl --proto '=https' --tlsv1.2 --retry 3 --retry-delay 2 -sSf https://sh.rustup.rs \
        | sh -s -- -y --no-modify-path --profile minimal \
    && rustc --version \
    && cargo --version
# Install sgl-kernel (from pre-built wheel)
# One `case` both validates CUDA_VERSION and selects the wheel:
#   12.6.1          -> +cu124 wheel from the sgl-project/whl release
#   12.8.1 / 12.9.1 -> +cu129 wheel from the sgl-project/whl release
#   13.0.1          -> sglang-kernel from the configured package index
# Every release download goes through GITHUB_ARTIFACTORY so a mirror is honoured
# consistently. --no-deps prevents pip from pulling torch from default PyPI.
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --upgrade pip setuptools wheel html5lib six \
    && WHL_BASE="https://${GITHUB_ARTIFACTORY}/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}" \
    && case "$CUDA_VERSION" in \
        12.6.1) \
            python3 -m pip install "${WHL_BASE}/sglang_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl" --force-reinstall --no-deps \
            ;; \
        12.8.1|12.9.1) \
            python3 -m pip install "${WHL_BASE}/sglang_kernel-${SGL_KERNEL_VERSION}+cu129-cp310-abi3-manylinux2014_$(uname -m).whl" --force-reinstall --no-deps \
            ;; \
        13.0.1) \
            python3 -m pip install sglang-kernel==${SGL_KERNEL_VERSION} --force-reinstall --no-deps \
            ;; \
        *) \
            echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
            ;; \
    esac
# Copy dep spec + Rust crate source + proto files. setuptools-rust compiles the
# Rust extension during the stub wheel build; the crate's build.rs references
# ../../proto for tonic_build. Split from the pip install so source changes to
# these paths invalidate the dep-install layer, but Python source changes don't.
COPY python/pyproject.toml /tmp/sglang_deps/python/pyproject.toml
COPY rust/sglang-grpc /tmp/sglang_deps/rust/sglang-grpc
COPY proto /tmp/sglang_deps/proto
# Install sglang dependencies (torch, transformers, etc.) by installing a stub
# `sglang` package built from pyproject.toml, then generate constraints.txt to
# prevent reinstalling these deps in later stages.
# On CUDA 12 builds, packages whose name ends in -cu13 are removed and
# torch/torchvision/torchaudio are reinstalled from the matching cuXXX index,
# followed by the sgl_deep_gemm wheel. These steps are chained with && so a
# failure in any of them fails the build instead of being masked by the next
# command. The stub itself is filtered out of constraints.txt in both forms
# pip freeze can print (`sglang==<ver>` and `sglang @ <url>`).
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cargo/registry \
    case "$CUDA_VERSION" in \
        12.6.1) CUINDEX=126 ;; \
        12.8.1) CUINDEX=128 ;; \
        12.9.1) CUINDEX=129 ;; \
        13.0.1) CUINDEX=130 ;; \
        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac \
    && cd /tmp/sglang_deps/python \
    && mkdir -p sglang \
    && touch sglang/__init__.py \
    && echo '__version__ = "0.0.0"' > sglang/version.py \
    && touch README.md \
    && touch LICENSE \
    && python3 -m pip install --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} ".[${BUILD_TYPE}]" \
    && if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
        pip list --format=freeze | awk -F'==' '/-cu13(==|$)/ {print $1}' \
            | xargs -r python3 -m pip uninstall -y \
        && python3 -m pip install --index-url https://download.pytorch.org/whl/cu${CUINDEX} \
            torch torchvision torchaudio --force-reinstall \
        && python3 -m pip install "https://${GITHUB_ARTIFACTORY}/sgl-project/whl/releases/download/v${SGL_DEEP_GEMM_VERSION}/sgl_deep_gemm-${SGL_DEEP_GEMM_VERSION}+cu129-py3-none-manylinux2014_$(uname -m).whl" --force-reinstall; \
    fi \
    && cd /sgl-workspace \
    && rm -rf /tmp/sglang_deps \
    && pip freeze | grep -Ev '^sglang(==| @ )' > /sgl-workspace/constraints.txt
########################################################
# PARALLEL STAGE 2: DeepEP Builder (needs torch_deps)
########################################################
FROM torch_deps AS deepep_builder
ARG CUDA_VERSION
ARG BUILD_AND_DOWNLOAD_PARALLEL
ARG GRACE_BLACKWELL
ARG GRACE_BLACKWELL_DEEPEP_BRANCH
ARG HOPPER_SBO
ARG HOPPER_SBO_DEEPEP_COMMIT
ARG DEEPEP_COMMIT
ARG GITHUB_ARTIFACTORY
WORKDIR /build

# Fetch the DeepEP sources into /build/DeepEP. The origin depends on the flags:
#   GRACE_BLACKWELL=1 -> fzyzcjy/DeepEP at GRACE_BLACKWELL_DEEPEP_BRANCH
#   HOPPER_SBO=1      -> deepseek-ai/DeepEP (antgroup-opt) at HOPPER_SBO_DEEPEP_COMMIT
#   otherwise         -> zip archive of deepseek-ai/DeepEP at DEEPEP_COMMIT
# In every case the two timeout constants in csrc/kernels/configs.cuh are
# raised by a factor of ten.
RUN set -eux; \
    if [ "$GRACE_BLACKWELL" = "1" ]; then \
        git clone https://github.com/fzyzcjy/DeepEP.git && \
        cd DeepEP && \
        git checkout ${GRACE_BLACKWELL_DEEPEP_BRANCH} && \
        sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
        sed -i 's/#define NUM_TIMEOUT_CYCLES 200000000000ull/#define NUM_TIMEOUT_CYCLES 2000000000000ull/' csrc/kernels/configs.cuh && \
        cd .. ; \
    elif [ "$HOPPER_SBO" = "1" ]; then \
        git clone https://github.com/deepseek-ai/DeepEP.git -b antgroup-opt && \
        cd DeepEP && \
        git checkout ${HOPPER_SBO_DEEPEP_COMMIT} && \
        sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
        sed -i 's/#define NUM_TIMEOUT_CYCLES 200000000000ull/#define NUM_TIMEOUT_CYCLES 2000000000000ull/' csrc/kernels/configs.cuh && \
        cd .. ; \
    else \
        curl --retry 3 --retry-delay 2 -fsSL -o ${DEEPEP_COMMIT}.zip \
            https://${GITHUB_ARTIFACTORY}/deepseek-ai/DeepEP/archive/${DEEPEP_COMMIT}.zip && \
        unzip -q ${DEEPEP_COMMIT}.zip && rm ${DEEPEP_COMMIT}.zip && mv DeepEP-${DEEPEP_COMMIT} DeepEP && cd DeepEP && \
        sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
        sed -i 's/#define NUM_TIMEOUT_CYCLES 200000000000ull/#define NUM_TIMEOUT_CYCLES 2000000000000ull/' csrc/kernels/configs.cuh && \
        cd .. ; \
    fi
# Build DeepEP wheel
# The TORCH_CUDA_ARCH_LIST passed to the build is selected from CUDA_VERSION;
# an unsupported version aborts the build. For CUDA 13 the DeepEP setup.py is
# patched in place so that ${CUDA_HOME}/include/cccl is appended to its
# include_dirs. The build runs with MAX_JOBS=BUILD_AND_DOWNLOAD_PARALLEL and
# writes the wheel to /wheels.
# NOTE(review): the sed address pattern is whitespace-sensitive and must match
# the indentation used in DeepEP's setup.py — confirm before editing.
RUN --mount=type=cache,target=/root/.cache/pip \
cd /build/DeepEP && \
case "$CUDA_VERSION" in \
12.6.1) \
CHOSEN_TORCH_CUDA_ARCH_LIST='9.0' \
;; \
12.8.1) \
CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0' \
;; \
12.9.1|13.0.1) \
CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0;10.3' \
;; \
*) \
echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
;; \
esac && \
if [ "${CUDA_VERSION%%.*}" = "13" ]; then \
sed -i "/^ include_dirs = \['csrc\/'\]/a\ include_dirs.append('${CUDA_HOME}/include/cccl')" setup.py; \
fi && \
TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" MAX_JOBS=${BUILD_AND_DOWNLOAD_PARALLEL} \
python3 setup.py bdist_wheel -d /wheels
########################################################
# PARALLEL STAGE 3: FlashInfer Cache (needs torch_deps)
########################################################
FROM torch_deps AS flashinfer_cache
ARG CUDA_VERSION
ARG INSTALL_FLASHINFER_JIT_CACHE
ARG FLASHINFER_VERSION

# /flashinfer_jit_output is always created so the later COPY has a source even
# when the cache is not requested. With INSTALL_FLASHINFER_JIT_CACHE=1 the
# flashinfer-jit-cache package and its dist-info are staged into it.
RUN --mount=type=cache,target=/root/.cache/pip \
    case "$CUDA_VERSION" in \
        12.6.1) CUINDEX=126 ;; \
        12.8.1) CUINDEX=128 ;; \
        12.9.1) CUINDEX=129 ;; \
        13.0.1) CUINDEX=130 ;; \
        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac \
    && mkdir -p /flashinfer_jit_output \
    && if [ "$INSTALL_FLASHINFER_JIT_CACHE" = "1" ]; then \
        python3 -m pip install flashinfer-jit-cache==${FLASHINFER_VERSION} --index-url https://flashinfer.ai/whl/cu${CUINDEX} \
        && cp -r /usr/local/lib/python3.12/dist-packages/flashinfer_jit_cache /flashinfer_jit_output/ \
        && cp -r /usr/local/lib/python3.12/dist-packages/flashinfer_jit_cache-*.dist-info /flashinfer_jit_output/ ; \
    fi
########################################################
# PARALLEL STAGE 4: Dev Tools Builder (starts from base)
########################################################
FROM base AS devtools_builder
ARG GITHUB_ARTIFACTORY
WORKDIR /tools

# Only zsh and git are installed here; they are what the oh-my-zsh step in this
# stage needs. The full set of dev apt packages (gdb, vim, tmux, nsight, etc.)
# is installed in the framework stage.
RUN --mount=type=cache,target=/var/cache/apt,id=devtools-apt \
    apt-get update \
    && apt-get install -y --no-install-recommends zsh git \
    && rm -rf /var/lib/apt/lists/*
# Download CLI tools (each in its own layer for parallel downloads).
# -f makes curl exit non-zero on an HTTP error instead of saving the error page
# as the "binary" and marking it executable.
RUN curl --retry 3 --retry-delay 2 -fsSL -o /tools/diff-so-fancy \
        https://${GITHUB_ARTIFACTORY}/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
    && chmod +x /tools/diff-so-fancy
# NOTE(review): this asset is the linux-amd64 build regardless of the target
# architecture — confirm whether an arm64 build should be used on aarch64.
RUN curl --retry 3 --retry-delay 2 -fsSL -o /tools/clang-format \
        https://${GITHUB_ARTIFACTORY}/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
    && chmod +x /tools/clang-format
# clangd 18.1.3: unpack the release zip, stage its bin/ and lib/ under /tools,
# then remove the archive and the extracted tree in the same layer.
RUN curl --retry 3 --retry-delay 2 -fsSL -o /tmp/clangd.zip \
        https://${GITHUB_ARTIFACTORY}/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip \
    && unzip -q /tmp/clangd.zip -d /tmp \
    && cp /tmp/clangd_18.1.3/bin/* /tools/ \
    && mkdir -p /tools/lib \
    && cp -r /tmp/clangd_18.1.3/lib/* /tools/lib/ \
    && rm -rf /tmp/clangd.zip /tmp/clangd_18.1.3
# CMake 3.31.1 release tarball for the build machine's architecture (uname -m):
# bin/ goes to /tools and share/ to /tools/share; temporaries are removed.
RUN CMAKE_VERSION=3.31.1 \
    && ARCH=$(uname -m) \
    && CMAKE_INSTALLER="cmake-${CMAKE_VERSION}-linux-${ARCH}" \
    && curl --retry 3 --retry-delay 2 -fsSL -o "/tmp/${CMAKE_INSTALLER}.tar.gz" \
        "https://${GITHUB_ARTIFACTORY}/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_INSTALLER}.tar.gz" \
    && tar -xzf "/tmp/${CMAKE_INSTALLER}.tar.gz" -C /tmp \
    && cp -r "/tmp/${CMAKE_INSTALLER}/bin/"* /tools/ \
    && mkdir -p /tools/share \
    && cp -r "/tmp/${CMAKE_INSTALLER}/share/"* /tools/share/ \
    && rm -rf "/tmp/${CMAKE_INSTALLER}" "/tmp/${CMAKE_INSTALLER}.tar.gz"
# just 1.42.4: the upstream install script is rewritten on the fly so its
# github.com downloads go through GITHUB_ARTIFACTORY, then run with /tools as
# the destination.
RUN curl --proto '=https' --tlsv1.2 --retry 3 --retry-delay 2 -sSf https://just.systems/install.sh \
        | sed "s|https://github.com|https://${GITHUB_ARTIFACTORY}|g" \
        | bash -s -- --tag 1.42.4 --to /tools
# oh-my-zsh (unattended install) plus the zsh-autosuggestions and
# zsh-syntax-highlighting plugins, cloned shallowly into the custom plugin dir.
RUN sh -c "$(curl --retry 3 --retry-delay 2 -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended \
    && git clone --depth 1 https://github.com/zsh-users/zsh-autosuggestions \
        ${ZSH_CUSTOM:-/root/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
    && git clone --depth 1 https://github.com/zsh-users/zsh-syntax-highlighting.git \
        ${ZSH_CUSTOM:-/root/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting
########################################################
# PARALLEL STAGE 5: Gateway Builder (starts from base)
########################################################
# Builds sgl-model-gateway in isolation so Python-only changes
# don't trigger a full Rust recompilation.
FROM base AS gateway_builder
ARG GITHUB_ARTIFACTORY
ARG BRANCH_TYPE
ARG SGL_VERSION
ARG USE_LATEST_SGLANG
WORKDIR /build

# Only the gateway sources are copied in, not the whole repository.
COPY sgl-model-gateway /build/sgl-model-gateway

# One layer does the whole Rust build:
#   1. install rustup and maturin,
#   2. build the Python binding wheel into /build/gateway_wheels,
#   3. build the sgl-model-gateway release binary and copy it to
#      /build/sgl-model-gateway-bin,
#   4. delete the Rust toolchain and both target directories.
RUN --mount=type=cache,target=/root/.cache/pip \
    curl --proto '=https' --tlsv1.2 --retry 3 --retry-delay 2 -sSf https://sh.rustup.rs | sh -s -- -y \
    && export PATH="/root/.cargo/bin:${PATH}" \
    && python3 -m pip install maturin \
    && cd /build/sgl-model-gateway/bindings/python \
    && ulimit -n 65536 \
    && maturin build --release --features vendored-openssl --out /build/gateway_wheels \
    && cd /build/sgl-model-gateway \
    && cargo build --release --bin sgl-model-gateway --features vendored-openssl \
    && cp target/release/sgl-model-gateway /build/sgl-model-gateway-bin \
    && rm -rf \
        /root/.cargo \
        /root/.rustup \
        /build/sgl-model-gateway/target \
        /build/sgl-model-gateway/bindings/python/target
########################################################
########## Final Framework Image ######################
########################################################
#
# Combines all artifacts from parallel builder stages
#
FROM torch_deps AS framework
ARG BRANCH_TYPE
ARG BUILD_TYPE
ARG CUDA_VERSION
ARG BUILD_AND_DOWNLOAD_PARALLEL
ARG SGL_VERSION
ARG USE_LATEST_SGLANG
ARG GITHUB_ARTIFACTORY
ARG MOONCAKE_VERSION
ARG MOONCAKE_COMPILE_ARG
WORKDIR /sgl-workspace

# =============================================================================
# Artifacts produced by the parallel builder stages
# =============================================================================

# DeepEP: keep the source tree in the workspace, install the built wheel, and
# remove the temporary wheel directory.
COPY --from=deepep_builder /wheels /tmp/wheels/deepep
COPY --from=deepep_builder /build/DeepEP /sgl-workspace/DeepEP
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install /tmp/wheels/deepep/*.whl \
    && rm -rf /tmp/wheels/deepep

# flashinfer jit-cache package (the staging directory is empty when it was not installed)
COPY --from=flashinfer_cache /flashinfer_jit_output/ /usr/local/lib/python3.12/dist-packages/

# Developer tools staged by devtools_builder
COPY --from=devtools_builder /tools/diff-so-fancy /usr/local/bin/
COPY --from=devtools_builder /tools/clang-format /usr/local/bin/
COPY --from=devtools_builder /tools/clangd /usr/local/bin/
COPY --from=devtools_builder /tools/lib /usr/local/lib/
COPY --from=devtools_builder /tools/cmake /usr/local/bin/
COPY --from=devtools_builder /tools/ctest /usr/local/bin/
COPY --from=devtools_builder /tools/cpack /usr/local/bin/
COPY --from=devtools_builder /tools/share/cmake-3.31 /usr/local/share/cmake-3.31
COPY --from=devtools_builder /tools/just /usr/local/bin/
COPY --from=devtools_builder /root/.oh-my-zsh /root/.oh-my-zsh
# Developer apt packages. They are installed here because the framework stage
# does not derive from devtools_builder.
RUN --mount=type=cache,target=/var/cache/apt,id=framework-apt \
    apt-get update && apt-get install -y --no-install-recommends \
        gdb \
        ninja-build \
        vim \
        tmux \
        htop \
        zsh \
        tree \
        silversearcher-ag \
        cloc \
        pkg-config \
        bear \
        less \
        rdma-core \
        openssh-server \
        gnuplot \
        infiniband-diags \
        perftest \
        ibverbs-providers \
        libibumad3 \
        libibverbs1 \
        libnl-3-200 \
        libnl-route-3-200 \
        librdmacm1 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
# NVIDIA development tools (nsight-systems-cli).
# The repository directory names are chosen once from `uname -m`: aarch64 maps
# to arm64 for both the devtools repo and the signing-key path; everything else
# maps to amd64 (devtools repo) and x86_64 (key path).
RUN --mount=type=cache,target=/var/cache/apt,id=framework-apt \
    if [ "$(uname -m)" = "aarch64" ]; then \
        DEVTOOLS_ARCH="arm64"; KEY_ARCH="arm64"; \
    else \
        DEVTOOLS_ARCH="amd64"; KEY_ARCH="x86_64"; \
    fi \
    && apt update -y \
    && apt install -y --no-install-recommends gnupg \
    && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/${DEVTOOLS_ARCH} /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
    && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/${KEY_ARCH}/7fa2af80.pub \
    && apt update -y \
    && apt install -y --no-install-recommends nsight-systems-cli \
    && rm -rf /var/lib/apt/lists/*
# =============================================================================
# Python packages and tools (before source copy for better caching)
# =============================================================================
# Mooncake transfer engine: the -cuda13 package is used when the CUDA major
# version is 13 or newer, the plain package otherwise.
RUN --mount=type=cache,target=/root/.cache/pip \
    cuda_major="${CUDA_VERSION%%.*}" \
    && if [ "$cuda_major" -ge 13 ]; then \
        python3 -m pip install mooncake-transfer-engine-cuda13==${MOONCAKE_VERSION}; \
    else \
        python3 -m pip install mooncake-transfer-engine==${MOONCAKE_VERSION}; \
    fi
# Developer/runtime Python tooling. The constraints file written by torch_deps
# is applied so these installs cannot move the already-installed dependencies.
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -c /sgl-workspace/constraints.txt \
        datamodel_code_generator \
        pre-commit \
        pytest \
        black \
        isort \
        icdiff \
        uv \
        wheel \
        scikit-build-core \
        py-spy \
        cubloaty \
        google-cloud-storage \
        pandas \
        matplotlib \
        tabulate \
        termplotlib \
        "runai-model-streamer[s3,gcs,azure]>=0.15.7"
# Per-CUDA-major package installs. The `nixl` stub package is needed (it owns
# the `nixl` import path) but unconditionally requires nixl-cu12, so we install
# it with --no-deps and pair it with the matching nixl-cu12 / nixl-cu13 binary
# to avoid shipping wrong-CUDA libs on cu13 images.
# The upstream flash-mla packages are required for running deepseek-v4 models.
# Every step in a branch is chained with && so that a failed install or clone
# fails the build; with `;` separators only the last command's status counted.
# On CUDA 13 the cccl `cuda` header directory is additionally linked into
# /usr/local/cuda/include before FlashMLA is built.
RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
        python3 -m pip install nixl nixl-cu12 --no-deps \
        && python3 -m pip install cuda-python==12.9 \
        && cd /sgl-workspace && git clone https://github.com/deepseek-ai/FlashMLA.git flash-mla \
        && cd flash-mla && git submodule update --init --recursive \
        && pip install --no-build-isolation -v . ; \
    elif [ "${CUDA_VERSION%%.*}" = "13" ]; then \
        python3 -m pip install nixl nixl-cu13 --no-deps \
        && python3 -m pip install cuda-python==13.2.0 \
        && cd /sgl-workspace && git clone https://github.com/deepseek-ai/FlashMLA.git flash-mla \
        && ln -s /usr/local/cuda/include/cccl/cuda /usr/local/cuda/include/cuda \
        && cd flash-mla && git submodule update --init --recursive \
        && pip install --no-build-isolation -v . ; \
    fi
# Add yank script (installed root-owned and executable, mode 755)
COPY --chown=root:root --chmod=755 docker/configs/yank /usr/local/bin/yank
# Default editor/tmux/git configs are placed under /opt/sglang.
# These configs are optional; users can override them by mounting their own files
COPY docker/configs/opt/.vimrc /opt/sglang/.vimrc
COPY docker/configs/opt/.tmux.conf /opt/sglang/.tmux.conf
COPY docker/configs/opt/.gitconfig /opt/sglang/.gitconfig
# Configure development environment: root's zsh startup file
COPY docker/configs/.zshrc /root/.zshrc
# Security upgrades for Trivy-reported CVEs
#   pip: urllib3 (CVE-2025-43859), pillow (CVE-2026-25990)
#   binutils family: CVE-2025-{1147,1148,3198,5244,5245,7545,7546,8225,11082,11083,11412,11413,11414,11494,11839,11840}
#   libgnutls30t64: CVE-2025-{9820,14831}
#   libpam: CVE-2024-10963
#   libsqlite3-0: CVE-2025-{6965,7709}
#   libtasn1-6: CVE-2025-13151
#   dpkg: CVE-2025-6297
RUN python3 -m pip install --upgrade "urllib3>=2.6.3" "pillow>=12.1.1"
# --only-upgrade: upgrade the listed packages without installing new ones.
RUN --mount=type=cache,target=/var/cache/apt,id=framework-apt \
    apt-get update && apt-get install -y --only-upgrade \
        binutils binutils-common binutils-x86-64-linux-gnu libbinutils \
        libctf0 libctf-nobfd0 libgprofng0 libsframe1 \
        libgnutls30t64 \
        libpam-modules libpam-modules-bin libpam-runtime libpam0g \
        libsqlite3-0 libtasn1-6 \
        dpkg dpkg-dev libdpkg-perl \
    && rm -rf /var/lib/apt/lists/*
# =============================================================================
# Copy sglang source and do editable install (LAST for better caching)
# =============================================================================
# Stage holding the build context, used when building from local source
FROM scratch AS local_src
COPY . /src
FROM framework AS framework_final
ARG BRANCH_TYPE
ARG BUILD_TYPE
ARG CUDA_VERSION
ARG SGL_VERSION
ARG USE_LATEST_SGLANG
WORKDIR /sgl-workspace
# Place the sglang sources at /sgl-workspace/sglang:
#   BRANCH_TYPE=local   -> copy of the build context
#   USE_LATEST_SGLANG=1 -> shallow clone of the default branch
#   otherwise           -> shallow clone of tag v${SGL_VERSION} (required)
# The build context is bind-mounted from local_src for this RUN only. Copying
# it into the image and deleting it in a later step would leave the whole
# source tree stored in the COPY layer.
RUN --mount=type=bind,from=local_src,source=/src,target=/tmp/local_src \
    if [ "$BRANCH_TYPE" = "local" ]; then \
        cp -r /tmp/local_src /sgl-workspace/sglang; \
    elif [ "$USE_LATEST_SGLANG" = "1" ]; then \
        git clone --depth=1 https://github.com/sgl-project/sglang.git /sgl-workspace/sglang; \
    elif [ -z "$SGL_VERSION" ]; then \
        echo "ERROR: SGL_VERSION must be set when USE_LATEST_SGLANG=0 and BRANCH_TYPE!=local" && exit 1; \
    else \
        git clone --depth=1 --branch v${SGL_VERSION} https://github.com/sgl-project/sglang.git /sgl-workspace/sglang; \
    fi
# Editable install of sglang without dependency resolution (dependencies were
# installed earlier), followed by the kernel lock/download step:
#   - aarch64: kernels-community/sgl-flash-attn3 ships no arm variants, so the
#     download is skipped and kernels are JIT-compiled at runtime. Remove this
#     branch once arm cubins are published upstream.
#   - otherwise: up to three download attempts, 30s apart; the step fails if
#     none succeeds.
# kernels.lock (when present) is moved to /root/.cache/sglang, and __pycache__
# directories under dist-packages are deleted in this same layer.
RUN --mount=type=cache,target=/root/.cache/pip \
    cd /sgl-workspace/sglang \
    && python3 -m pip install --no-deps -e "python[${BUILD_TYPE}]" \
    && kernels lock python \
    && ( success=0; \
         if [ "$(uname -m)" = "aarch64" ]; then \
             echo "Skipping kernels-community/sgl-flash-attn3 cubin download on aarch64 (no variants published upstream); kernels will be JIT-compiled at runtime"; \
             success=1; \
         else \
             for i in 1 2 3; do \
                 echo "Attempt $i/3: downloading sgl-kernel cubins..." && \
                 kernels download python && \
                 success=1 && break; \
                 echo "sgl-kernel cubin download failed, retrying in 30s..." && sleep 30; \
             done; \
         fi; \
         [ "$success" = "1" ] ) \
    && mkdir -p /root/.cache/huggingface /root/.cache/sglang \
    && ( if [ -f python/kernels.lock ]; then mv python/kernels.lock /root/.cache/sglang/; fi ) \
    && ( find /usr/local/lib/python3.12/dist-packages -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true )
# Gateway artifacts from gateway_builder: the binary goes on PATH, the Python
# binding wheel is force-reinstalled and the temporary wheel dir removed.
COPY --from=gateway_builder /build/sgl-model-gateway-bin /usr/local/bin/sgl-model-gateway
COPY --from=gateway_builder /build/gateway_wheels /tmp/gateway_wheels
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --force-reinstall /tmp/gateway_wheels/*.whl \
    && rm -rf /tmp/gateway_wheels

# Default working directory of the image
WORKDIR /sgl-workspace/sglang

# Build provenance is declared last so that changing it does not invalidate
# the build layers above.
ARG SGLANG_BUILD_COMMIT=unknown
ARG SGLANG_BUILD_URL=
ARG SGLANG_IMAGE_TAG=local/sglang:dev
ENV SGLANG_BUILD_COMMIT=${SGLANG_BUILD_COMMIT:-unknown} \
    SGLANG_BUILD_URL=${SGLANG_BUILD_URL:-} \
    SGLANG_IMAGE_TAG=${SGLANG_IMAGE_TAG:-local/sglang:dev}
LABEL org.opencontainers.image.source="https://github.com/sgl-project/sglang" \
      org.opencontainers.image.revision="${SGLANG_BUILD_COMMIT}" \
      org.opencontainers.image.version="${SGLANG_IMAGE_TAG}" \
      org.opencontainers.image.url="${SGLANG_BUILD_URL}" \
      ai.sglang.build.commit="${SGLANG_BUILD_COMMIT}" \
      ai.sglang.build.url="${SGLANG_BUILD_URL}" \
      ai.sglang.image.tag="${SGLANG_IMAGE_TAG}"
########################################################
########## Runtime Image ##############################
########################################################
#
# PURPOSE: Production runtime environment with JIT support
#
# This stage creates a production-ready image containing:
# - Pre-compiled SGLang and DeepEP components
# - Full CUDA toolchain for JIT compilation (DeepGEMM, Triton, FlashInfer)
# - Optimized for inference workloads and deployment
# - Smaller than framework (no dev tools like vim, tmux, nsight, etc.)
#
# Use this stage when you need:
# - Production deployment of SGLang
# - JIT compilation support for FP8/microscaling kernels
# - Ready-to-run inference server environment
#
# Note: Uses devel base for complete NVCC toolchain required by DeepGEMM JIT
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu24.04 AS runtime
ARG CUDA_VERSION
ARG TARGETARCH
ARG GDRCOPY_VERSION=2.5.1

ENV DEBIAN_FRONTEND=noninteractive \
    CUDA_HOME=/usr/local/cuda \
    GDRCOPY_HOME=/usr/src/gdrdrv-${GDRCOPY_VERSION}/

# Search paths: GKE default NVIDIA bin/lib locations, plus the CUDA compiler
# directories needed by FlashInfer JIT.
ENV PATH="${PATH}:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/cuda/nvvm/bin" \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
# Runtime packages (the devel base image already provides gcc/g++/build tools).
# Python 3.12 ships in Ubuntu 24.04 main, so no deadsnakes PPA is needed.
# The same layer then makes python3.12 the default python3/python, bootstraps
# pip, enables global pip installs, and drops the apt lists/cache.
RUN --mount=type=cache,target=/var/cache/apt,id=runtime-apt \
    apt-get update && apt-get install -y --no-install-recommends --allow-change-held-packages \
        # Python runtime
        python3.12-full \
        python3.12-dev \
        wget \
        # Core system utilities
        ca-certificates \
        netcat-openbsd \
        curl \
        git \
        # Runtime libraries
        libopenmpi3 \
        libnuma1 \
        libibverbs1 \
        libibumad3 \
        librdmacm1 \
        libnl-3-200 \
        libnl-route-3-200 \
        ibverbs-providers \
        libgoogle-glog0v6t64 \
        libunwind8 \
        libboost-system1.83.0 \
        libboost-thread1.83.0 \
        libboost-filesystem1.83.0 \
        libgrpc++1.51t64 \
        libprotobuf32t64 \
        libhiredis1.1.0 \
        libcurl4 \
        libczmq4 \
        libfabric1 \
        libssl3 \
        # RDMA runtime
        rdma-core \
        infiniband-diags \
        perftest \
        # Build tools for JIT compilation
        ninja-build \
        # NCCL packages needed for pynccl_allocator JIT compilation (-lnccl)
        libnccl2 \
        libnccl-dev \
        # GPG key verification
        gnupg2 \
        linux-libc-dev \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 \
    && update-alternatives --set python3 /usr/bin/python3.12 \
    && ln -sf /usr/bin/python3.12 /usr/bin/python \
    && wget -q https://bootstrap.pypa.io/get-pip.py \
    && python3 get-pip.py --break-system-packages \
    && rm get-pip.py \
    # Allow pip to install packages globally (PEP 668 workaround for Ubuntu 24.04)
    && python3 -m pip config set global.break-system-packages true \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
# Install the locales package, generate en_US.UTF-8 and make it the default.
RUN apt-get update \
    && apt-get install -y --no-install-recommends locales \
    && locale-gen en_US.UTF-8 \
    && rm -rf /var/lib/apt/lists/*
ENV LANG=en_US.UTF-8 \
    LANGUAGE=en_US:en \
    LC_ALL=en_US.UTF-8

# Security upgrades for Trivy-reported CVEs (the framework stage carries the
# full CVE list). --only-upgrade never installs packages that are absent.
RUN --mount=type=cache,target=/var/cache/apt,id=runtime-apt \
    apt-get update && apt-get install -y --only-upgrade \
        binutils binutils-common binutils-x86-64-linux-gnu libbinutils \
        libctf0 libctf-nobfd0 libgprofng0 libsframe1 \
        libgnutls30t64 \
        libpam-modules libpam-modules-bin libpam-runtime libpam0g \
        libsqlite3-0 libtasn1-6 \
        dpkg dpkg-dev libdpkg-perl \
    && rm -rf /var/lib/apt/lists/*
# Copy Python site-packages from framework_final (its __pycache__ directories
# were removed during the editable-install step)
COPY --from=framework_final /usr/local/lib/python3.12/dist-packages /usr/local/lib/python3.12/dist-packages
# Copy SGLang workspace
COPY --from=framework_final /sgl-workspace /sgl-workspace
# Copy sgl-model-gateway binary
COPY --from=framework_final /usr/local/bin/sgl-model-gateway /usr/local/bin/sgl-model-gateway
# Copy py-spy binary
COPY --from=framework_final /usr/local/bin/py-spy /usr/local/bin/py-spy
# Copy cache for kernels from kernels community
COPY --from=framework_final /root/.cache/huggingface /root/.cache/huggingface
COPY --from=framework_final /root/.cache/sglang /root/.cache/sglang
# Copy GDRCopy runtime libraries (but not the build artifacts).
# The driver source directory follows GDRCOPY_VERSION so it stays in step with
# GDRCOPY_HOME when the version build argument is overridden.
COPY --from=framework_final /usr/lib/libgdrapi.so* /usr/lib/
COPY --from=framework_final /usr/bin/gdrcopy_* /usr/bin/
COPY --from=framework_final /usr/src/gdrdrv-${GDRCOPY_VERSION} /usr/src/gdrdrv-${GDRCOPY_VERSION}
# Fix DeepEP IBGDA symlink in runtime
RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so
WORKDIR /sgl-workspace/sglang
# Build provenance is declared last so that changing it does not invalidate
# the build layers above. The values are exported both as environment
# variables and as image labels.
ARG SGLANG_BUILD_COMMIT=unknown
ARG SGLANG_BUILD_URL=
ARG SGLANG_IMAGE_TAG=local/sglang:dev
ENV SGLANG_BUILD_COMMIT=${SGLANG_BUILD_COMMIT:-unknown} \
    SGLANG_BUILD_URL=${SGLANG_BUILD_URL:-} \
    SGLANG_IMAGE_TAG=${SGLANG_IMAGE_TAG:-local/sglang:dev}
LABEL org.opencontainers.image.source="https://github.com/sgl-project/sglang" \
      org.opencontainers.image.revision="${SGLANG_BUILD_COMMIT}" \
      org.opencontainers.image.version="${SGLANG_IMAGE_TAG}" \
      org.opencontainers.image.url="${SGLANG_BUILD_URL}" \
      ai.sglang.build.commit="${SGLANG_BUILD_COMMIT}" \
      ai.sglang.build.url="${SGLANG_BUILD_URL}" \
      ai.sglang.image.tag="${SGLANG_IMAGE_TAG}"

# Default command: an interactive bash shell (exec form)
CMD ["/bin/bash"]