[rocm-libraries] ROCm/rocm-libraries#8644 (commit 8b2545e)

[CK][CI] Expand other stages to use healthy-node retry logic (#8644)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Motivation

CI stages bound to a single node via the declarative `agent{ label }`
cannot recover when that node is unhealthy — a `NodeFault` (e.g. an
exhausted `gitNetRetry` after a persistent `Could not resolve host:
github.com`, or a missing GPU) just fails the whole build. The existing
`runOnHealthyNode` wrapper already reroutes such faults to a different
node, but only the "Build CK and run Tests" stages used it. This PR
brings the remaining node-bound stages under that wrapper so a bad node
reroutes instead of failing the build.

## Technical Details

- `runOnHealthyNode`/`preflight` (`ck.groovy`): `preflight` now takes a
  `requireGpu` flag that gates the GPU-only checks
  (`driverUp`/`devicesUp`); `daemonUp`/`diskOk` always run.
  `runOnHealthyNode` derives the flag from the node label
  (`!label.contains('nogpu')`), so no call-site argument is needed and
  nogpu stages skip the GPU checks automatically.
- `diskOk` (`ck.groovy`): the default path is now `/var/jenkins` instead of
  `/var/jenkins/workspace`, and the check is skipped (treated as passing)
  when the path does not exist on the node.
- `Jenkinsfile`: migrated 19 `agent{ label }` stages to `agent none` +
  `ck.runOnHealthyNode(...)`:
  - 12 GPU test stages: Pytorch, AITER ×2, FA ×2, 3 grouped-conv (gfx90a),
    TILE_ENGINE_BASIC, TILE_ENGINE_GEMM ×3.
  - 4 FMHA stages: the per-stage `environment{}` block was dissolved and
    `build_and_run_fmha(arch)` is now evaluated on-node inside the closure
    (required since `agent none` evaluates `environment{}` off-node).
  - 3 nogpu stages: Determine CI Execution, Build Docker, Static checks
    (reroute via the label-derived `requireGpu=false`).
- The commented-out `gfx908`/`gfx1010` stages were intentionally left
  untouched.

## Test Plan

- Validated the Jenkinsfile against the pipeline linter
  (`pipeline-model-converter/validate`).
- Manual pipeline run to confirm no regressions.
- Key checks: nogpu stages pass `preflight` (no false
  `gpu-devices-missing`); migrated stages still allocate, build, and test
  as before; on a `NodeFault`, the `Node attempt N/3 on <node>` reroute
  lines appear.

## Test Result

- Linter: **"Jenkinsfile successfully validated."**
- Groovy shared-library pipeline — not locally executable; the manual
  pipeline run listed in the Test Plan is required to fully validate.
  Check CI.

## Submission Checklist

- [ ] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
This commit is contained in:
Brock Hargreaves
2026-06-29 16:03:23 +00:00
committed by assistant-librarian[bot]
parent 2089713f94
commit 3719bf05c2
2 changed files with 147 additions and 92 deletions

220
Jenkinsfile vendored
View File

@@ -270,13 +270,15 @@ pipeline {
}
stages{
stage("Determine CI Execution") {
agent{ label rocmnode("nogpu") }
agent none
steps {
script {
loadCk()
ck.checkoutComposableKernel()
env.SHOULD_RUN_CI = String.valueOf(params.FORCE_CI.toBoolean() || ck.shouldRunCICheck())
echo "SHOULD_RUN_CI: ${env.SHOULD_RUN_CI}"
ck.runOnHealthyNode(rocmnode("nogpu")) {
ck.checkoutComposableKernel()
env.SHOULD_RUN_CI = String.valueOf(params.FORCE_CI.toBoolean() || ck.shouldRunCICheck())
echo "SHOULD_RUN_CI: ${env.SHOULD_RUN_CI}"
}
}
}
}
@@ -287,14 +289,16 @@ pipeline {
}
parallel{
stage('Docker /opt/rocm'){
agent{ label rocmnode("nogpu") }
agent none
steps{
deleteDir()
script {
loadCk()
ck.buildDocker('/opt/rocm')
ck.runOnHealthyNode(rocmnode("nogpu")) {
deleteDir()
ck.buildDocker('/opt/rocm')
cleanWs()
}
}
cleanWs()
}
}
}
@@ -304,11 +308,16 @@ pipeline {
beforeAgent true
expression { env.SHOULD_RUN_CI.toBoolean() }
}
agent{ label rocmnode("nogpu") }
agent none
steps{
deleteDir()
script { loadCk(); ck.runStaticChecks() }
cleanWs()
script {
loadCk()
ck.runOnHealthyNode(rocmnode("nogpu")) {
deleteDir()
ck.runStaticChecks()
cleanWs()
}
}
}
}
stage("Run Downstream Tests")
@@ -325,13 +334,15 @@ pipeline {
beforeAgent true
expression { params.RUN_PYTORCH_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942")}
agent none
steps{
script {
loadCk()
ck.run_downstream_tests(image: "${env.CK_PYTORCH_IMAGE}", timeoutHours: 2, execute_cmds: ck.getPytorchTestsCmds())
ck.runOnHealthyNode(rocmnode("gfx942")) {
ck.run_downstream_tests(image: "${env.CK_PYTORCH_IMAGE}", timeoutHours: 2, execute_cmds: ck.getPytorchTestsCmds())
cleanWs()
}
}
cleanWs()
}
}
stage("Run AITER Tests on gfx942")
@@ -340,13 +351,15 @@ pipeline {
beforeAgent true
expression { params.RUN_AITER_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942")}
agent none
steps{
script {
loadCk()
ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
ck.runOnHealthyNode(rocmnode("gfx942")) {
ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
cleanWs()
}
}
cleanWs()
}
}
stage("Run AITER Tests on gfx950")
@@ -355,13 +368,15 @@ pipeline {
beforeAgent true
expression { params.RUN_AITER_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx950")}
agent none
steps{
script {
loadCk()
ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
ck.runOnHealthyNode(rocmnode("gfx950")) {
ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
cleanWs()
}
}
cleanWs()
}
}
stage("Run FA Tests on gfx942")
@@ -370,13 +385,15 @@ pipeline {
beforeAgent true
expression { params.RUN_FA_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942")}
agent none
steps{
script {
loadCk()
ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
ck.runOnHealthyNode(rocmnode("gfx942")) {
ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
cleanWs()
}
}
cleanWs()
}
}
stage("Run FA Tests on gfx950")
@@ -385,13 +402,15 @@ pipeline {
beforeAgent true
expression { params.RUN_FA_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx950")}
agent none
steps{
script {
loadCk()
ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
ck.runOnHealthyNode(rocmnode("gfx950")) {
ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
cleanWs()
}
}
cleanWs()
}
}
}
@@ -410,11 +429,16 @@ pipeline {
beforeAgent true
expression { params.RUN_FULL_CONV_TILE_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a")}
agent none
steps{
deleteDir()
script { loadCk(); ck.runFullGroupedConvTileTests() }
cleanWs()
script {
loadCk()
ck.runOnHealthyNode(rocmnode("gfx90a")) {
deleteDir()
ck.runFullGroupedConvTileTests()
cleanWs()
}
}
}
}
}
@@ -433,11 +457,16 @@ pipeline {
beforeAgent true
expression { params.RUN_GROUPED_CONV_LARGE_CASES_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a")}
agent none
steps{
deleteDir()
script { loadCk(); ck.runGroupedConvLargeCaseTests() }
cleanWs()
script {
loadCk()
ck.runOnHealthyNode(rocmnode("gfx90a")) {
deleteDir()
ck.runGroupedConvLargeCaseTests()
cleanWs()
}
}
}
}
}
@@ -456,11 +485,16 @@ pipeline {
beforeAgent true
expression { params.RUN_CONV_COMPREHENSIVE_DATASET.toBoolean() }
}
agent{ label rocmnode("gfx90a")}
agent none
steps{
deleteDir()
script { loadCk(); ck.runComprehensiveConvDatasetTests() }
cleanWs()
script {
loadCk()
ck.runOnHealthyNode(rocmnode("gfx90a")) {
deleteDir()
ck.runComprehensiveConvDatasetTests()
cleanWs()
}
}
}
}
}
@@ -479,18 +513,16 @@ pipeline {
beforeAgent true
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = ck.build_and_run_fmha("gfx90a")
}
agent none
steps{
deleteDir()
script {
loadCk()
ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
ck.runOnHealthyNode(rocmnode("gfx90a")) {
deleteDir()
ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx90a"))
cleanWs()
}
}
cleanWs()
}
}
stage("Run CK_TILE_FMHA Tests on gfx942")
@@ -499,18 +531,16 @@ pipeline {
beforeAgent true
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = ck.build_and_run_fmha("gfx942")
}
agent none
steps{
deleteDir()
script {
loadCk()
ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
ck.runOnHealthyNode(rocmnode("gfx942")) {
deleteDir()
ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx942"))
cleanWs()
}
}
cleanWs()
}
}
stage("Run CK_TILE_FMHA Tests on gfx950")
@@ -519,18 +549,16 @@ pipeline {
beforeAgent true
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx950") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = ck.build_and_run_fmha("gfx950")
}
agent none
steps{
deleteDir()
script {
loadCk()
ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
ck.runOnHealthyNode(rocmnode("gfx950")) {
deleteDir()
ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx950"))
cleanWs()
}
}
cleanWs()
}
}
stage("Run CK_TILE_FMHA Tests on gfx1201")
@@ -539,18 +567,16 @@ pipeline {
beforeAgent true
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx1201") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = ck.build_and_run_fmha("gfx1201")
}
agent none
steps{
deleteDir()
script {
loadCk()
ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
ck.runOnHealthyNode(rocmnode("gfx1201")) {
deleteDir()
ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx1201"))
cleanWs()
}
}
cleanWs()
}
}
}
@@ -569,11 +595,16 @@ pipeline {
beforeAgent true
expression { params.RUN_TILE_ENGINE_BASIC_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942") }
agent none
steps{
deleteDir()
script { loadCk(); ck.runTileEngineBasicTests(params.BUILD_COMPILER) }
cleanWs()
script {
loadCk()
ck.runOnHealthyNode(rocmnode("gfx942")) {
deleteDir()
ck.runTileEngineBasicTests(params.BUILD_COMPILER)
cleanWs()
}
}
}
}
}
@@ -592,11 +623,16 @@ pipeline {
beforeAgent true
expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942") }
agent none
steps{
deleteDir()
script { loadCk(); ck.runTileEngineGemmTests("gfx942", params.BUILD_COMPILER) }
cleanWs()
script {
loadCk()
ck.runOnHealthyNode(rocmnode("gfx942")) {
deleteDir()
ck.runTileEngineGemmTests("gfx942", params.BUILD_COMPILER)
cleanWs()
}
}
}
}
stage("Run TILE_ENGINE_GEMM Tests on gfx950")
@@ -605,11 +641,16 @@ pipeline {
beforeAgent true
expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx950") }
agent none
steps{
deleteDir()
script { loadCk(); ck.runTileEngineGemmTests("gfx950", params.BUILD_COMPILER) }
cleanWs()
script {
loadCk()
ck.runOnHealthyNode(rocmnode("gfx950")) {
deleteDir()
ck.runTileEngineGemmTests("gfx950", params.BUILD_COMPILER)
cleanWs()
}
}
}
}
stage("Run TILE_ENGINE_GEMM Tests on gfx1201")
@@ -618,11 +659,16 @@ pipeline {
beforeAgent true
expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx1201") }
agent none
steps{
deleteDir()
script { loadCk(); ck.runTileEngineGemmTests("gfx1201", params.BUILD_COMPILER) }
cleanWs()
script {
loadCk()
ck.runOnHealthyNode(rocmnode("gfx1201")) {
deleteDir()
ck.runTileEngineGemmTests("gfx1201", params.BUILD_COMPILER)
cleanWs()
}
}
}
}
}

View File

@@ -453,8 +453,12 @@ def devicesUp() {
sh(returnStatus:true, script:'test -e /dev/kfd && ls /dev/dri/renderD* >/dev/null 2>&1') == 0
}
def cacheWritable() { sh(returnStatus:true, script:'D=${SCCACHE_DIR:-/.cache/sccache}; mkdir -p "$D/probe" 2>/dev/null') == 0 }
def diskOk(String path='/var/jenkins/workspace', int minGb=5) {
def diskOk(String path='/var/jenkins', int minGb=5) {
echo "Preflight: checking disk space on ${path} (minimum ${minGb}GB)"
if (sh(returnStatus:true, script:"test -d ${path}") != 0) {
echo "Preflight: disk check path ${path} does not exist, skipping"
return true
}
sh(returnStdout:true, script:"df --output=avail -BG ${path} | tail -1 | tr -dc '0-9'").trim().toInteger() >= minGb
}
@@ -464,11 +468,13 @@ def gpuUsable(String image) { sh(returnStatus:true, script:"docker run --rm --de
// Fail fast with a NodeFault if this agent is unfit to build. Host-only — no image
// required. Image/registry/container faults are classified in the body by pullImage
// and the in-container GPU check, where the correct conf is available.
def preflight() {
def preflight(boolean requireGpu) {
echo "Preflight: starting node health checks on ${env.NODE_NAME}"
if (!daemonUp()) throw new org.ck.NodeFault('docker-daemon-down')
if (!driverUp()) throw new org.ck.NodeFault('driver-not-loaded')
if (!devicesUp()) throw new org.ck.NodeFault('gpu-devices-missing')
if (requireGpu) {
if (!driverUp()) throw new org.ck.NodeFault('driver-not-loaded')
if (!devicesUp()) throw new org.ck.NodeFault('gpu-devices-missing')
}
if (!diskOk()) throw new org.ck.NodeFault('disk-space-low')
echo "Preflight: all checks passed on ${env.NODE_NAME}"
// sccache cache-dir writability is not checked here: sccache runs inside
@@ -543,7 +549,10 @@ def runOnHealthyNode(String label, Closure body) {
node(exclude(label, excluded)) {
attemptNode = env.NODE_NAME
echo "Node attempt ${attempt + 1}/${nodeAttempts} on ${attemptNode}"
preflight()
// Derive GPU requirement from the node label: only "nogpu" stages
// skip the driver/device checks. A new non-GPU label would need
// adding here (otherwise preflight would wrongly demand a GPU).
preflight(!label.contains('nogpu'))
runInPlace(body, transientRetries)
}
return