Files
sglang/.github/workflows/pr-test-sgl-kernel.yml
Baizhou Zhang 6ecd6f84db [CI] Add per-job uv venv isolation and upgrade CI version to Cuda 13 (#23119)
Co-authored-by: Kangyan Zhou <zky314343421@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: Alison Shao <a.shao@wustl.edu>
Co-authored-by: Mick <mickjagger19@icloud.com>
2026-04-19 05:32:36 -07:00

215 lines
6.0 KiB
YAML

name: PR Test - SGL Kernel

# Reusable workflow: the only trigger is `workflow_call`, so it runs when
# another workflow invokes it and supplies the inputs below.
on:
  workflow_call:
    inputs:
      # Handed to scripts/ci/cuda/ci_install_dependency.sh as the
      # CUSTOM_BUILD_SGL_KERNEL environment variable.
      sgl_kernel:
        required: true
        type: string
      # Runner label for the sgl-kernel-b200-test job.
      b200_runner:
        required: true
        type: string
      # First choice for the checkout ref; when empty, git_ref and then
      # github.sha are used.
      pr_head_sha:
        required: false
        type: string
        default: ''
      # Checkout ref used when pr_head_sha is empty.
      git_ref:
        required: false
        type: string
        default: ''
      # Surfaced to every job as SKIP_STAGE_HEALTH_CHECK ('true' or 'false').
      skip_stage_health_check:
        required: false
        type: boolean
        default: false

# Workflow-level env is NOT inherited from the caller in reusable workflows.
# The github context (including github.event_name) IS inherited from the caller.
env:
  # Environment values always reach the job as strings; quote to make that explicit.
  SGLANG_IS_IN_CI: "true"
  SGLANG_CUDA_COREDUMP: "1"
  # 'true' only when the run's ref is the main branch.
  SGLANG_PR_TEST_BYPASS_MAINTENANCE_ON_MAIN: ${{ github.ref == 'refs/heads/main' && 'true' || 'false' }}
  # Normalizes the boolean input to the literal strings 'true' / 'false'.
  SKIP_STAGE_HEALTH_CHECK: ${{ inputs.skip_stage_health_check == true && 'true' || 'false' }}
jobs:
  # Runs the sgl-kernel pytest suite (sgl-kernel/tests/) on a 1-gpu-h100 runner.
  sgl-kernel-unit-test:
    runs-on: 1-gpu-h100
    timeout-minutes: 240
    steps:
      - uses: actions/checkout@v4
        with:
          # Prefer the explicit PR head SHA, then git_ref, then the triggering commit.
          ref: ${{ inputs.pr_head_sha || inputs.git_ref || github.sha }}

      - uses: ./.github/actions/check-stage-health

      - uses: ./.github/actions/check-maintenance

      # Empty sgl-kernel/dist before the download; `|| true` keeps the step
      # from failing when the directory does not exist.
      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      # Fetch artifacts matching the Python 3.10 / CUDA 13.0 wheel pattern into sgl-kernel/dist/.
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda13.0

      - name: Install dependencies
        timeout-minutes: 20
        # Pass the input through env rather than expanding `${{ }}` inside the
        # script, so its value is never parsed as shell code.
        env:
          CUSTOM_BUILD_SGL_KERNEL: ${{ inputs.sgl_kernel }}
        run: |
          bash scripts/ci/cuda/ci_install_dependency.sh diffusion

      - name: Run test
        timeout-minutes: 30
        run: |
          cd sgl-kernel
          pytest tests/

  # Runs test/registered/mla/test_mla_deepseek_v3.py on a 1-gpu-h100 runner.
  sgl-kernel-mla-test:
    runs-on: 1-gpu-h100
    timeout-minutes: 240
    steps:
      - uses: actions/checkout@v4
        with:
          # Prefer the explicit PR head SHA, then git_ref, then the triggering commit.
          ref: ${{ inputs.pr_head_sha || inputs.git_ref || github.sha }}

      - uses: ./.github/actions/check-stage-health

      - uses: ./.github/actions/check-maintenance

      # Empty sgl-kernel/dist before the download; `|| true` keeps the step
      # from failing when the directory does not exist.
      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      # Fetch artifacts matching the Python 3.10 / CUDA 13.0 wheel pattern into sgl-kernel/dist/.
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda13.0

      - name: Install dependencies
        timeout-minutes: 20
        # Pass the input through env rather than expanding `${{ }}` inside the
        # script, so its value is never parsed as shell code.
        env:
          CUSTOM_BUILD_SGL_KERNEL: ${{ inputs.sgl_kernel }}
        run: |
          bash scripts/ci/cuda/ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          cd test/registered/mla
          python3 test_mla_deepseek_v3.py

  # Executes every sgl-kernel/benchmark/bench_*.py script with a 60-second cap each.
  sgl-kernel-benchmark-test:
    runs-on: 1-gpu-h100
    timeout-minutes: 240
    steps:
      - uses: actions/checkout@v4
        with:
          # Prefer the explicit PR head SHA, then git_ref, then the triggering commit.
          ref: ${{ inputs.pr_head_sha || inputs.git_ref || github.sha }}

      - uses: ./.github/actions/check-stage-health

      - uses: ./.github/actions/check-maintenance

      # Empty sgl-kernel/dist before the download; `|| true` keeps the step
      # from failing when the directory does not exist.
      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      # Fetch artifacts matching the Python 3.10 / CUDA 13.0 wheel pattern into sgl-kernel/dist/.
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda13.0

      - name: Install dependencies
        timeout-minutes: 20
        # Pass the input through env rather than expanding `${{ }}` inside the
        # script, so its value is never parsed as shell code.
        env:
          CUSTOM_BUILD_SGL_KERNEL: ${{ inputs.sgl_kernel }}
        run: |
          bash scripts/ci/cuda/ci_install_dependency.sh

      # A benchmark that fails or exceeds its 60 s limit only logs a warning;
      # the loop moves on, so a single script cannot fail this step.
      - name: Run benchmark tests
        timeout-minutes: 45
        run: |
          cd sgl-kernel/benchmark
          echo "Running sgl-kernel benchmark tests in CI mode..."
          echo "CI environment variable: $CI"
          echo "GITHUB_ACTIONS environment variable: $GITHUB_ACTIONS"
          for bench_file in bench_*.py; do
            echo "Testing $bench_file..."
            timeout 60 python3 "$bench_file" || echo "Warning: $bench_file timed out or failed, continuing..."
            echo "Completed $bench_file"
            echo "---"
          done
          echo "All benchmark tests completed!"

  # Runs the sgl-kernel pytest suite on the runner named by the b200_runner input.
  sgl-kernel-b200-test:
    runs-on: ${{ inputs.b200_runner }}
    timeout-minutes: 240
    steps:
      - uses: actions/checkout@v4
        with:
          # Prefer the explicit PR head SHA, then git_ref, then the triggering commit.
          ref: ${{ inputs.pr_head_sha || inputs.git_ref || github.sha }}

      - uses: ./.github/actions/check-stage-health

      - uses: ./.github/actions/check-maintenance

      # Empty sgl-kernel/dist before the download; `|| true` keeps the step
      # from failing when the directory does not exist.
      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      # Fetch artifacts matching the Python 3.10 / CUDA 13.0 wheel pattern into sgl-kernel/dist/.
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda13.0

      - name: Install dependencies
        timeout-minutes: 20
        # Pass the input through env rather than expanding `${{ }}` inside the
        # script, so its value is never parsed as shell code.
        env:
          CUSTOM_BUILD_SGL_KERNEL: ${{ inputs.sgl_kernel }}
        run: |
          bash scripts/ci/cuda/ci_install_dependency.sh diffusion

      - name: Run sgl-kernel unit tests on B200
        timeout-minutes: 30
        run: |
          cd sgl-kernel
          pytest tests/
# Single CUDA 13 smoke test that verifies the kernel builds and runs.
# TODO: Re-enable this job once it passes on CI.
# cuda13-kernel-smoke-test:
# if: inputs.sgl_kernel == 'true'
# runs-on: x64-cu13-kernel-tests
# steps:
# - uses: actions/checkout@v4
# - name: Cleanup
# run: |
# ls -alh sgl-kernel/dist || true
# rm -rf sgl-kernel/dist/* || true
# - name: Download CUDA 13.0 artifacts
# uses: actions/download-artifact@v4
# with:
# path: sgl-kernel/dist/
# merge-multiple: true
# pattern: wheel-python3.10-cuda13.0
# - name: Install dependencies
# run: |
# CUSTOM_BUILD_SGL_KERNEL=${{inputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
# - name: Run kernel unit tests
# timeout-minutes: 30
# run: |
# cd sgl-kernel
# pytest tests/