sglang/sgl-kernel/build.sh

#!/bin/bash
# Trace every command and stop at the first unhandled failure.
set -o errexit -o xtrace

# PYTHON_VERSION and CUDA_VERSION are mandatory; ARCH may be omitted.
(( $# >= 2 )) || {
  echo "Usage: $0 <PYTHON_VERSION> <CUDA_VERSION> [ARCH]"
  exit 1
}

# Positional arguments.
PYTHON_VERSION="$1"          # e.g. 3.10
CUDA_VERSION="$2"            # e.g. 12.9
# ARCH defaults to the machine hardware name of the build host. `uname -m` is
# used rather than `uname -i`: the -i option is non-portable and prints
# "unknown" on many systems, which would silently select the x86 builder image
# and hand a meaningless ARCH value to the later build steps.
ARCH="${3:-$(uname -m)}"     # optional override

# Pick the manylinux builder image matching the target architecture; anything
# other than aarch64 uses the manylinux2_28 builder.
case "${ARCH}" in
  aarch64)
    BASE_IMG="pytorch/manylinuxaarch64-builder"
    ;;
  *)
    BASE_IMG="pytorch/manylinux2_28-builder"
    ;;
esac

# Persistent build caches. They are kept under the home directory (not the
# workspace) so they survive workspace cleanups and fresh checkouts.
CACHE_DIR="${HOME}/.cache/sgl-kernel"
BUILDX_CACHE_DIR="${CACHE_DIR}/buildx"   # local buildx layer cache
CCACHE_HOST_DIR="${CACHE_DIR}/ccache"    # host side of the /ccache mount
mkdir -p \
  "${BUILDX_CACHE_DIR}" \
  "${CCACHE_HOST_DIR}"

# Dedicated buildx builder. It uses the docker-container driver, which is
# required for cache export.
BUILDER_NAME="sgl-kernel-builder"

# RESET_BUILDER=1 throws away the builder and the local buildx cache, then lets
# the code below recreate them. Use it to clear corrupted internal state
# (e.g. stale containerd snapshots from base image layer GC).
case "${RESET_BUILDER:-0}" in
  1)
    echo "Resetting buildx builder: ${BUILDER_NAME}"
    # Best effort: the builder may not exist yet, so a failed removal is ignored.
    docker buildx rm "${BUILDER_NAME}" 2>/dev/null || true
    rm -rf "${BUILDX_CACHE_DIR}"
    mkdir -p "${BUILDX_CACHE_DIR}"
    ;;
esac

# Select the builder if it already exists, otherwise create and bootstrap it.
if docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
  docker buildx use "${BUILDER_NAME}"
else
  docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use --bootstrap
fi

# Python tag with the dots stripped from the version and the "cpXY" part
# repeated, e.g. 3.10 -> cp310-cp310.
py_abi="cp${PYTHON_VERSION//./}"
PY_TAG="${py_abi}-${py_abi}"

# Wheels are collected in this directory (relative to the current directory).
DIST_DIR="dist"
mkdir -p "${DIST_DIR}"

# Print the effective build configuration, one entry per line. Optional knobs
# that are unset are shown with the placeholder values given in the expansions.
printf '%s\n' \
  "----------------------------------------" \
  "Build configuration" \
  "PYTHON_VERSION: ${PYTHON_VERSION}" \
  "CUDA_VERSION:   ${CUDA_VERSION}" \
  "ARCH:           ${ARCH}" \
  "BASE_IMG:       ${BASE_IMG}" \
  "PYTHON_TAG:     ${PY_TAG}" \
  "Output:         ${DIST_DIR}/" \
  "Buildx cache:   ${BUILDX_CACHE_DIR}" \
  "ccache dir:     ${CCACHE_HOST_DIR}" \
  "Builder:        ${BUILDER_NAME}" \
  "BUILD_JOBS:     ${BUILD_JOBS:-auto}" \
  "NVCC_THREADS:   ${NVCC_THREADS:-32}" \
  "USE_CCACHE:     ${USE_CCACHE:-1}" \
  "RESET_BUILDER:  ${RESET_BUILDER:-0}" \
  "----------------------------------------"

# Optional --build-arg flags. Each environment variable listed below is
# forwarded under its own name only when it is set to a non-empty value; an
# unset or empty variable contributes nothing.
BUILD_ARGS=()
for opt_name in \
  ENABLE_CMAKE_PROFILE \
  ENABLE_BUILD_PROFILE \
  USE_CCACHE \
  BUILD_JOBS \
  NVCC_THREADS; do
  opt_value="${!opt_name:-}"   # indirect lookup of the variable named by opt_name
  if [ -n "${opt_value}" ]; then
    BUILD_ARGS+=(--build-arg "${opt_name}=${opt_value}")
  fi
done

# ---- Step 1: Build deps image (layer cached, fast on repeat) ----
# The image tag encodes CUDA version, Python tag and architecture.
DEPS_TAG="sgl-kernel-deps:cuda${CUDA_VERSION}-${PY_TAG}-${ARCH}"

# Assemble the buildx invocation as an array so every argument stays one word.
deps_build_cmd=(
  docker buildx build
  --builder "${BUILDER_NAME}"
  -f Dockerfile .
  # Fixed build arguments describing the target configuration.
  --build-arg BASE_IMG="${BASE_IMG}"
  --build-arg CUDA_VERSION="${CUDA_VERSION}"
  --build-arg ARCH="${ARCH}"
  --build-arg PYTHON_VERSION="${PYTHON_VERSION}"
  --build-arg PYTHON_TAG="${PY_TAG}"
  # Optional build arguments collected earlier (may be empty).
  "${BUILD_ARGS[@]}"
  # Read from and write back to the local buildx cache directory.
  --cache-from "type=local,src=${BUILDX_CACHE_DIR}"
  --cache-to "type=local,dest=${BUILDX_CACHE_DIR},mode=max"
  # Stop at the "deps" stage and load the result into the local image store
  # under DEPS_TAG.
  --target deps
  --load
  -t "${DEPS_TAG}"
  --network=host
)
"${deps_build_cmd[@]}"

echo "Deps image ready: ${DEPS_TAG}"

# ---- Step 2: Build wheel with host-mounted ccache ----
# The wheel is built inside the deps image with the current directory mounted
# at /sgl-kernel and the host ccache directory mounted at /ccache, which allows
# ccache to persist on the host filesystem across builds.
CCACHE_FLAG="${USE_CCACHE:-1}"
BUILD_JOBS_FLAG="${BUILD_JOBS:-0}"
NVCC_THREADS_FLAG="${NVCC_THREADS:-32}"

# The in-container script below is one fully single-quoted literal. Host-side
# settings are handed over as positional parameters ($1..$3) instead of being
# spliced into the script text, so values containing whitespace or shell
# metacharacters cannot change what the script executes. ARCH reaches the
# script through the container environment (-e ARCH).
docker run --rm \
  --network=host \
  -v "$(pwd):/sgl-kernel" \
  -v "${CCACHE_HOST_DIR}:/ccache" \
  -w /sgl-kernel \
  -e ARCH="${ARCH}" \
  "${DEPS_TAG}" \
  bash -c '
set -eux

# Settings received from the host script as positional parameters.
USE_CCACHE="$1"
BUILD_JOBS="$2"
NVCC_THREADS="$3"

if [ "${USE_CCACHE}" = "1" ]; then
  # Point ccache at the host-mounted directory and route the C, C++ and CUDA
  # compilers through it via the CMake compiler-launcher variables.
  export CCACHE_DIR=/ccache
  export CCACHE_BASEDIR=/sgl-kernel
  export CCACHE_MAXSIZE=10G
  export CCACHE_COMPILERCHECK=content
  export CCACHE_COMPRESS=true
  export CCACHE_SLOPPINESS=file_macro,time_macros,include_file_mtime,include_file_ctime
  export CMAKE_C_COMPILER_LAUNCHER=ccache
  export CMAKE_CXX_COMPILER_LAUNCHER=ccache
  export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
  echo "=== ccache stats (before) ==="
  ccache -sV
fi

# Build parallelism:
#   - aarch64: fixed conservative settings
#   - BUILD_JOBS is a positive integer: use it as given
#   - otherwise: derive the job count from the core count
if [ "${ARCH}" = "aarch64" ]; then
  export CUDA_NVCC_FLAGS="-Xcudafe --threads=8"
  export MAKEFLAGS="-j8"
  export CMAKE_BUILD_PARALLEL_LEVEL=2
  export NINJAFLAGS="-j4"
  echo "ARM detected: Using extra conservative settings (2 parallel jobs)"
elif [ "${BUILD_JOBS}" -gt 0 ] 2>/dev/null; then
  # stderr is silenced so a non-numeric BUILD_JOBS just falls through to auto.
  export CMAKE_BUILD_PARALLEL_LEVEL="${BUILD_JOBS}"
else
  # Two thirds of the available cores, clamped to the range 1..64. The lower
  # bound keeps a single-core host from ending up with a job count of 0.
  auto_jobs=$(( $(nproc) * 2 / 3 ))
  if [ "${auto_jobs}" -gt 64 ]; then
    auto_jobs=64
  fi
  if [ "${auto_jobs}" -lt 1 ]; then
    auto_jobs=1
  fi
  export CMAKE_BUILD_PARALLEL_LEVEL="${auto_jobs}"
fi

# Append the NVCC thread count to any CMAKE_ARGS already present in the
# container environment.
export CMAKE_ARGS="${CMAKE_ARGS:-} -DSGL_KERNEL_COMPILE_THREADS=${NVCC_THREADS}"
echo "Build parallelism: CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}, NVCC_THREADS=${NVCC_THREADS}"

# PYTHON_ROOT_PATH is not set by this script; it is expected to come from the
# image environment (set -u aborts the build if it is missing).
"${PYTHON_ROOT_PATH}/bin/python" -m uv build --wheel -Cbuild-dir=build . --color=always --no-build-isolation
PYTHON="${PYTHON_ROOT_PATH}/bin/python" ./rename_wheels.sh

if [ "${USE_CCACHE}" = "1" ]; then
  echo "=== ccache stats (after) ==="
  ccache -s
fi
' sgl-kernel-build \
  "${CCACHE_FLAG}" \
  "${BUILD_JOBS_FLAG}" \
  "${NVCC_THREADS_FLAG}"

echo "Done. Wheels are in ${DIST_DIR}/"
# The listing is informational only: a missing wheel must not fail the script
# at this point, so errors from ls are discarded.
ls -lh "${DIST_DIR}"/*.whl 2>/dev/null || true