From 3719bf05c2dc424aa771ecd5942b7c661e9e53c8 Mon Sep 17 00:00:00 2001 From: Brock Hargreaves <253123018+brockhargreaves-amd@users.noreply.github.com> Date: Mon, 29 Jun 2026 16:03:23 +0000 Subject: [PATCH] [rocm-libraries] ROCm/rocm-libraries#8644 (commit 8b2545e) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [CK][CI] Expand other stages to use healthy-node retry logic. (#8644) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Motivation CI stages bound to a single node via the declarative `agent{ label }` cannot recover when that node is unhealthy — a `NodeFault` (e.g. an exhausted `gitNetRetry` after a persistent `Could not resolve host: github.com`, or a missing GPU) just fails the whole build. The existing `runOnHealthyNode` wrapper already reroutes such faults to a different node, but only the "Build CK and run Tests" stages used it. This PR brings the remaining node-bound stages under that wrapper so a bad node reroutes instead of failing the build. ## Technical Details - `runOnHealthyNode`/`preflight` (`ck.groovy`): `preflight` now takes a `requireGpu` flag that gates the GPU-only checks (`driverUp`/`devicesUp`); `daemonUp`/`diskOk` always run. `runOnHealthyNode` derives it from the node label (`!label.contains('nogpu')`), so no call-site argument is needed and nogpu stages skip the GPU checks automatically. - `Jenkinsfile`: migrated 19 `agent{ label }` stages to `agent none` + `ck.runOnHealthyNode(...)`: - 12 GPU test stages: Pytorch, AITER ×2, FA ×2, 3 grouped-conv (gfx90a), TILE_ENGINE_BASIC, TILE_ENGINE_GEMM ×3. - 4 FMHA stages: the per-stage `environment{}` block was dissolved and `build_and_run_fmha(arch)` is now evaluated on-node inside the closure (required since `agent none` evaluates `environment{}` off-node). - 3 nogpu stages: Determine CI Execution, Build Docker, Static checks (reroute via the label-derived `requireGpu=false`). 
- The commented-out `gfx908`/`gfx1010` stages were intentionally left untouched. ## Test Plan - Validated the Jenkinsfile against the pipeline linter (`pipeline-model-converter/validate`). - Manual pipeline run to confirm no regressions. - Key checks: nogpu stages pass `preflight` (no false `gpu-devices-missing`); migrated stages still allocate, build, and test as before; on a `NodeFault` the `Node attempt N/3 … on NODE_NAME` reroute lines appear. ## Test Result - Linter: **"Jenkinsfile successfully validated."** - Groovy shared-library pipeline — not locally executable; the manual build above is required to fully validate. Check CI. ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --- Jenkinsfile | 220 +++++++++++++++++++++++++----------------- groovy/vars/ck.groovy | 19 +++- 2 files changed, 147 insertions(+), 92 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6915c73b0c..4b49313446 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -270,13 +270,15 @@ pipeline { } stages{ stage("Determine CI Execution") { - agent{ label rocmnode("nogpu") } + agent none steps { script { loadCk() - ck.checkoutComposableKernel() - env.SHOULD_RUN_CI = String.valueOf(params.FORCE_CI.toBoolean() || ck.shouldRunCICheck()) - echo "SHOULD_RUN_CI: ${env.SHOULD_RUN_CI}" + ck.runOnHealthyNode(rocmnode("nogpu")) { + ck.checkoutComposableKernel() + env.SHOULD_RUN_CI = String.valueOf(params.FORCE_CI.toBoolean() || ck.shouldRunCICheck()) + echo "SHOULD_RUN_CI: ${env.SHOULD_RUN_CI}" + } } } } @@ -287,14 +289,16 @@ pipeline { } parallel{ stage('Docker /opt/rocm'){ - agent{ label rocmnode("nogpu") } + agent none steps{ - deleteDir() script { loadCk() - ck.buildDocker('/opt/rocm') + ck.runOnHealthyNode(rocmnode("nogpu")) { + deleteDir() + ck.buildDocker('/opt/rocm') + cleanWs() + } } - cleanWs() } } } @@ -304,11 +308,16 @@ pipeline { beforeAgent true expression { env.SHOULD_RUN_CI.toBoolean() } } - agent{ label
rocmnode("nogpu") } + agent none steps{ - deleteDir() - script { loadCk(); ck.runStaticChecks() } - cleanWs() + script { + loadCk() + ck.runOnHealthyNode(rocmnode("nogpu")) { + deleteDir() + ck.runStaticChecks() + cleanWs() + } + } } } stage("Run Downstream Tests") @@ -325,13 +334,15 @@ pipeline { beforeAgent true expression { params.RUN_PYTORCH_TESTS.toBoolean() } } - agent{ label rocmnode("gfx942")} + agent none steps{ script { loadCk() - ck.run_downstream_tests(image: "${env.CK_PYTORCH_IMAGE}", timeoutHours: 2, execute_cmds: ck.getPytorchTestsCmds()) + ck.runOnHealthyNode(rocmnode("gfx942")) { + ck.run_downstream_tests(image: "${env.CK_PYTORCH_IMAGE}", timeoutHours: 2, execute_cmds: ck.getPytorchTestsCmds()) + cleanWs() + } } - cleanWs() } } stage("Run AITER Tests on gfx942") @@ -340,13 +351,15 @@ pipeline { beforeAgent true expression { params.RUN_AITER_TESTS.toBoolean() } } - agent{ label rocmnode("gfx942")} + agent none steps{ script { loadCk() - ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds()) + ck.runOnHealthyNode(rocmnode("gfx942")) { + ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds()) + cleanWs() + } } - cleanWs() } } stage("Run AITER Tests on gfx950") @@ -355,13 +368,15 @@ pipeline { beforeAgent true expression { params.RUN_AITER_TESTS.toBoolean() } } - agent{ label rocmnode("gfx950")} + agent none steps{ script { loadCk() - ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds()) + ck.runOnHealthyNode(rocmnode("gfx950")) { + ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds()) + cleanWs() + } } - cleanWs() } } stage("Run FA Tests on gfx942") @@ -370,13 +385,15 @@ pipeline { beforeAgent true expression { params.RUN_FA_TESTS.toBoolean() } } - agent{ label rocmnode("gfx942")} + agent none steps{ script { loadCk() - 
ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds()) + ck.runOnHealthyNode(rocmnode("gfx942")) { + ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds()) + cleanWs() + } } - cleanWs() } } stage("Run FA Tests on gfx950") @@ -385,13 +402,15 @@ pipeline { beforeAgent true expression { params.RUN_FA_TESTS.toBoolean() } } - agent{ label rocmnode("gfx950")} + agent none steps{ script { loadCk() - ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds()) + ck.runOnHealthyNode(rocmnode("gfx950")) { + ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds()) + cleanWs() + } } - cleanWs() } } } @@ -410,11 +429,16 @@ pipeline { beforeAgent true expression { params.RUN_FULL_CONV_TILE_TESTS.toBoolean() } } - agent{ label rocmnode("gfx90a")} + agent none steps{ - deleteDir() - script { loadCk(); ck.runFullGroupedConvTileTests() } - cleanWs() + script { + loadCk() + ck.runOnHealthyNode(rocmnode("gfx90a")) { + deleteDir() + ck.runFullGroupedConvTileTests() + cleanWs() + } + } } } } @@ -433,11 +457,16 @@ pipeline { beforeAgent true expression { params.RUN_GROUPED_CONV_LARGE_CASES_TESTS.toBoolean() } } - agent{ label rocmnode("gfx90a")} + agent none steps{ - deleteDir() - script { loadCk(); ck.runGroupedConvLargeCaseTests() } - cleanWs() + script { + loadCk() + ck.runOnHealthyNode(rocmnode("gfx90a")) { + deleteDir() + ck.runGroupedConvLargeCaseTests() + cleanWs() + } + } } } } @@ -456,11 +485,16 @@ pipeline { beforeAgent true expression { params.RUN_CONV_COMPREHENSIVE_DATASET.toBoolean() } } - agent{ label rocmnode("gfx90a")} + agent none steps{ - deleteDir() - script { loadCk(); ck.runComprehensiveConvDatasetTests() } - cleanWs() + script { + loadCk() + ck.runOnHealthyNode(rocmnode("gfx90a")) { + deleteDir() + ck.runComprehensiveConvDatasetTests() + cleanWs() + } + } } } } @@ -479,18 
+513,16 @@ pipeline { beforeAgent true expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() } } - agent{ label rocmnode("gfx90a") } - environment{ - setup_args = "NO_CK_BUILD" - execute_args = ck.build_and_run_fmha("gfx90a") - } + agent none steps{ - deleteDir() script { loadCk() - ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) + ck.runOnHealthyNode(rocmnode("gfx90a")) { + deleteDir() + ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx90a")) + cleanWs() + } } - cleanWs() } } stage("Run CK_TILE_FMHA Tests on gfx942") @@ -499,18 +531,16 @@ pipeline { beforeAgent true expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() } } - agent{ label rocmnode("gfx942") } - environment{ - setup_args = "NO_CK_BUILD" - execute_args = ck.build_and_run_fmha("gfx942") - } + agent none steps{ - deleteDir() script { loadCk() - ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) + ck.runOnHealthyNode(rocmnode("gfx942")) { + deleteDir() + ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx942")) + cleanWs() + } } - cleanWs() } } stage("Run CK_TILE_FMHA Tests on gfx950") @@ -519,18 +549,16 @@ pipeline { beforeAgent true expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() } } - agent{ label rocmnode("gfx950") } - environment{ - setup_args = "NO_CK_BUILD" - execute_args = ck.build_and_run_fmha("gfx950") - } + agent none steps{ - deleteDir() script { loadCk() - ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) + ck.runOnHealthyNode(rocmnode("gfx950")) { + deleteDir() + ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx950")) + cleanWs() + } } - cleanWs() } } stage("Run CK_TILE_FMHA Tests on gfx1201") @@ -539,18 +567,16 @@ pipeline { beforeAgent true expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() } } - 
agent{ label rocmnode("gfx1201") } - environment{ - setup_args = "NO_CK_BUILD" - execute_args = ck.build_and_run_fmha("gfx1201") - } + agent none steps{ - deleteDir() script { loadCk() - ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) + ck.runOnHealthyNode(rocmnode("gfx1201")) { + deleteDir() + ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx1201")) + cleanWs() + } } - cleanWs() } } } @@ -569,11 +595,16 @@ pipeline { beforeAgent true expression { params.RUN_TILE_ENGINE_BASIC_TESTS.toBoolean() } } - agent{ label rocmnode("gfx942") } + agent none steps{ - deleteDir() - script { loadCk(); ck.runTileEngineBasicTests(params.BUILD_COMPILER) } - cleanWs() + script { + loadCk() + ck.runOnHealthyNode(rocmnode("gfx942")) { + deleteDir() + ck.runTileEngineBasicTests(params.BUILD_COMPILER) + cleanWs() + } + } } } } @@ -592,11 +623,16 @@ pipeline { beforeAgent true expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() } } - agent{ label rocmnode("gfx942") } + agent none steps{ - deleteDir() - script { loadCk(); ck.runTileEngineGemmTests("gfx942", params.BUILD_COMPILER) } - cleanWs() + script { + loadCk() + ck.runOnHealthyNode(rocmnode("gfx942")) { + deleteDir() + ck.runTileEngineGemmTests("gfx942", params.BUILD_COMPILER) + cleanWs() + } + } } } stage("Run TILE_ENGINE_GEMM Tests on gfx950") @@ -605,11 +641,16 @@ pipeline { beforeAgent true expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() } } - agent{ label rocmnode("gfx950") } + agent none steps{ - deleteDir() - script { loadCk(); ck.runTileEngineGemmTests("gfx950", params.BUILD_COMPILER) } - cleanWs() + script { + loadCk() + ck.runOnHealthyNode(rocmnode("gfx950")) { + deleteDir() + ck.runTileEngineGemmTests("gfx950", params.BUILD_COMPILER) + cleanWs() + } + } } } stage("Run TILE_ENGINE_GEMM Tests on gfx1201") @@ -618,11 +659,16 @@ pipeline { beforeAgent true expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() 
} } - agent{ label rocmnode("gfx1201") } + agent none steps{ - deleteDir() - script { loadCk(); ck.runTileEngineGemmTests("gfx1201", params.BUILD_COMPILER) } - cleanWs() + script { + loadCk() + ck.runOnHealthyNode(rocmnode("gfx1201")) { + deleteDir() + ck.runTileEngineGemmTests("gfx1201", params.BUILD_COMPILER) + cleanWs() + } + } } } } diff --git a/groovy/vars/ck.groovy b/groovy/vars/ck.groovy index ef1a82ee76..c21e8ab3a6 100644 --- a/groovy/vars/ck.groovy +++ b/groovy/vars/ck.groovy @@ -453,8 +453,12 @@ def devicesUp() { sh(returnStatus:true, script:'test -e /dev/kfd && ls /dev/dri/renderD* >/dev/null 2>&1') == 0 } def cacheWritable() { sh(returnStatus:true, script:'D=${SCCACHE_DIR:-/.cache/sccache}; mkdir -p "$D/probe" 2>/dev/null') == 0 } -def diskOk(String path='/var/jenkins/workspace', int minGb=5) { +def diskOk(String path='/var/jenkins', int minGb=5) { echo "Preflight: checking disk space on ${path} (minimum ${minGb}GB)" + if (sh(returnStatus:true, script:"test -d ${path}") != 0) { + echo "Preflight: disk check path ${path} does not exist, skipping" + return true + } sh(returnStdout:true, script:"df --output=avail -BG ${path} | tail -1 | tr -dc '0-9'").trim().toInteger() >= minGb } @@ -464,11 +468,13 @@ def gpuUsable(String image) { sh(returnStatus:true, script:"docker run --rm --de // Fail fast with a NodeFault if this agent is unfit to build. Host-only — no image // required. Image/registry/container faults are classified in the body by pullImage // and the in-container GPU check, where the correct conf is available. 
-def preflight() { +def preflight(boolean requireGpu) { echo "Preflight: starting node health checks on ${env.NODE_NAME}" if (!daemonUp()) throw new org.ck.NodeFault('docker-daemon-down') - if (!driverUp()) throw new org.ck.NodeFault('driver-not-loaded') - if (!devicesUp()) throw new org.ck.NodeFault('gpu-devices-missing') + if (requireGpu) { + if (!driverUp()) throw new org.ck.NodeFault('driver-not-loaded') + if (!devicesUp()) throw new org.ck.NodeFault('gpu-devices-missing') + } if (!diskOk()) throw new org.ck.NodeFault('disk-space-low') echo "Preflight: all checks passed on ${env.NODE_NAME}" // sccache cache-dir writability is not checked here: sccache runs inside @@ -543,7 +549,10 @@ def runOnHealthyNode(String label, Closure body) { node(exclude(label, excluded)) { attemptNode = env.NODE_NAME echo "Node attempt ${attempt + 1}/${nodeAttempts} on ${attemptNode}" - preflight() + // Derive GPU requirement from the node label: only "nogpu" stages + // skip the driver/device checks. A new non-GPU label would need + // adding here (otherwise preflight would wrongly demand a GPU). + preflight(!label.contains('nogpu')) runInPlace(body, transientRetries) } return