mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-30 11:47:48 +00:00
[rocm-libraries] ROCm/rocm-libraries#8644 (commit 8b2545e)
[CK][CI] Expand other stages to use healthy-node retry logic. (#8644)

## Motivation

CI stages bound to a single node via the declarative `agent{ label }` cannot recover when that node is unhealthy — a `NodeFault` (e.g. an exhausted `gitNetRetry` after a persistent `Could not resolve host: github.com`, or a missing GPU) simply fails the whole build. The existing `runOnHealthyNode` wrapper already reroutes such faults to a different node, but only the "Build CK and run Tests" stages used it. This PR brings the remaining node-bound stages under that wrapper so that a bad node triggers a reroute instead of failing the build.

## Technical Details

- `runOnHealthyNode`/`preflight` (`ck.groovy`): `preflight` now takes a `requireGpu` flag that gates the GPU-only checks (`driverUp`/`devicesUp`); `daemonUp`/`diskOk` always run. `runOnHealthyNode` derives the flag from the node label (`!label.contains('nogpu')`), so no call-site argument is needed and nogpu stages skip the GPU checks automatically.
- `Jenkinsfile`: migrated 19 `agent{ label }` stages to `agent none` + `ck.runOnHealthyNode(...)`:
  - 12 GPU test stages: Pytorch, AITER ×2, FA ×2, 3 grouped-conv (gfx90a), TILE_ENGINE_BASIC, TILE_ENGINE_GEMM ×3.
  - 4 FMHA stages: the per-stage `environment{}` block was dissolved and `build_and_run_fmha(arch)` is now evaluated on-node inside the closure (required since `agent none` evaluates `environment{}` off-node).
  - 3 nogpu stages: Determine CI Execution, Build Docker, Static checks (reroute via the label-derived `requireGpu=false`).
  - The commented-out `gfx908`/`gfx1010` stages were intentionally left untouched.

## Test Plan

- Validated the Jenkinsfile against the pipeline linter (`pipeline-model-converter/validate`).
- Manual pipeline run to confirm no regressions.
  - Key checks: nogpu stages pass `preflight` (no false `gpu-devices-missing`); migrated stages still allocate, build, and test as before; on a `NodeFault` the `Node attempt N/3 … on <node>` reroute lines appear.

## Test Result

- Linter: **"Jenkinsfile successfully validated."**
- The Groovy shared-library pipeline is not locally executable, so the manual build above is required to fully validate the change; check the CI results.

## Submission Checklist

- [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
This commit is contained in:
committed by
assistant-librarian[bot]
parent
2089713f94
commit
3719bf05c2
220
Jenkinsfile
vendored
220
Jenkinsfile
vendored
@@ -270,13 +270,15 @@ pipeline {
|
||||
}
|
||||
stages{
|
||||
stage("Determine CI Execution") {
|
||||
agent{ label rocmnode("nogpu") }
|
||||
agent none
|
||||
steps {
|
||||
script {
|
||||
loadCk()
|
||||
ck.checkoutComposableKernel()
|
||||
env.SHOULD_RUN_CI = String.valueOf(params.FORCE_CI.toBoolean() || ck.shouldRunCICheck())
|
||||
echo "SHOULD_RUN_CI: ${env.SHOULD_RUN_CI}"
|
||||
ck.runOnHealthyNode(rocmnode("nogpu")) {
|
||||
ck.checkoutComposableKernel()
|
||||
env.SHOULD_RUN_CI = String.valueOf(params.FORCE_CI.toBoolean() || ck.shouldRunCICheck())
|
||||
echo "SHOULD_RUN_CI: ${env.SHOULD_RUN_CI}"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -287,14 +289,16 @@ pipeline {
|
||||
}
|
||||
parallel{
|
||||
stage('Docker /opt/rocm'){
|
||||
agent{ label rocmnode("nogpu") }
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script {
|
||||
loadCk()
|
||||
ck.buildDocker('/opt/rocm')
|
||||
ck.runOnHealthyNode(rocmnode("nogpu")) {
|
||||
deleteDir()
|
||||
ck.buildDocker('/opt/rocm')
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -304,11 +308,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { env.SHOULD_RUN_CI.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("nogpu") }
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script { loadCk(); ck.runStaticChecks() }
|
||||
cleanWs()
|
||||
script {
|
||||
loadCk()
|
||||
ck.runOnHealthyNode(rocmnode("nogpu")) {
|
||||
deleteDir()
|
||||
ck.runStaticChecks()
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
stage("Run Downstream Tests")
|
||||
@@ -325,13 +334,15 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_PYTORCH_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx942")}
|
||||
agent none
|
||||
steps{
|
||||
script {
|
||||
loadCk()
|
||||
ck.run_downstream_tests(image: "${env.CK_PYTORCH_IMAGE}", timeoutHours: 2, execute_cmds: ck.getPytorchTestsCmds())
|
||||
ck.runOnHealthyNode(rocmnode("gfx942")) {
|
||||
ck.run_downstream_tests(image: "${env.CK_PYTORCH_IMAGE}", timeoutHours: 2, execute_cmds: ck.getPytorchTestsCmds())
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
stage("Run AITER Tests on gfx942")
|
||||
@@ -340,13 +351,15 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_AITER_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx942")}
|
||||
agent none
|
||||
steps{
|
||||
script {
|
||||
loadCk()
|
||||
ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
|
||||
ck.runOnHealthyNode(rocmnode("gfx942")) {
|
||||
ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
stage("Run AITER Tests on gfx950")
|
||||
@@ -355,13 +368,15 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_AITER_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx950")}
|
||||
agent none
|
||||
steps{
|
||||
script {
|
||||
loadCk()
|
||||
ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
|
||||
ck.runOnHealthyNode(rocmnode("gfx950")) {
|
||||
ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
stage("Run FA Tests on gfx942")
|
||||
@@ -370,13 +385,15 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_FA_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx942")}
|
||||
agent none
|
||||
steps{
|
||||
script {
|
||||
loadCk()
|
||||
ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
|
||||
ck.runOnHealthyNode(rocmnode("gfx942")) {
|
||||
ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
stage("Run FA Tests on gfx950")
|
||||
@@ -385,13 +402,15 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_FA_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx950")}
|
||||
agent none
|
||||
steps{
|
||||
script {
|
||||
loadCk()
|
||||
ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
|
||||
ck.runOnHealthyNode(rocmnode("gfx950")) {
|
||||
ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -410,11 +429,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_FULL_CONV_TILE_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx90a")}
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script { loadCk(); ck.runFullGroupedConvTileTests() }
|
||||
cleanWs()
|
||||
script {
|
||||
loadCk()
|
||||
ck.runOnHealthyNode(rocmnode("gfx90a")) {
|
||||
deleteDir()
|
||||
ck.runFullGroupedConvTileTests()
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -433,11 +457,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_GROUPED_CONV_LARGE_CASES_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx90a")}
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script { loadCk(); ck.runGroupedConvLargeCaseTests() }
|
||||
cleanWs()
|
||||
script {
|
||||
loadCk()
|
||||
ck.runOnHealthyNode(rocmnode("gfx90a")) {
|
||||
deleteDir()
|
||||
ck.runGroupedConvLargeCaseTests()
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -456,11 +485,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_CONV_COMPREHENSIVE_DATASET.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx90a")}
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script { loadCk(); ck.runComprehensiveConvDatasetTests() }
|
||||
cleanWs()
|
||||
script {
|
||||
loadCk()
|
||||
ck.runOnHealthyNode(rocmnode("gfx90a")) {
|
||||
deleteDir()
|
||||
ck.runComprehensiveConvDatasetTests()
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -479,18 +513,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx90a") }
|
||||
environment{
|
||||
setup_args = "NO_CK_BUILD"
|
||||
execute_args = ck.build_and_run_fmha("gfx90a")
|
||||
}
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script {
|
||||
loadCk()
|
||||
ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
|
||||
ck.runOnHealthyNode(rocmnode("gfx90a")) {
|
||||
deleteDir()
|
||||
ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx90a"))
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
stage("Run CK_TILE_FMHA Tests on gfx942")
|
||||
@@ -499,18 +531,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx942") }
|
||||
environment{
|
||||
setup_args = "NO_CK_BUILD"
|
||||
execute_args = ck.build_and_run_fmha("gfx942")
|
||||
}
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script {
|
||||
loadCk()
|
||||
ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
|
||||
ck.runOnHealthyNode(rocmnode("gfx942")) {
|
||||
deleteDir()
|
||||
ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx942"))
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
stage("Run CK_TILE_FMHA Tests on gfx950")
|
||||
@@ -519,18 +549,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx950") }
|
||||
environment{
|
||||
setup_args = "NO_CK_BUILD"
|
||||
execute_args = ck.build_and_run_fmha("gfx950")
|
||||
}
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script {
|
||||
loadCk()
|
||||
ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
|
||||
ck.runOnHealthyNode(rocmnode("gfx950")) {
|
||||
deleteDir()
|
||||
ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx950"))
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
stage("Run CK_TILE_FMHA Tests on gfx1201")
|
||||
@@ -539,18 +567,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx1201") }
|
||||
environment{
|
||||
setup_args = "NO_CK_BUILD"
|
||||
execute_args = ck.build_and_run_fmha("gfx1201")
|
||||
}
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script {
|
||||
loadCk()
|
||||
ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
|
||||
ck.runOnHealthyNode(rocmnode("gfx1201")) {
|
||||
deleteDir()
|
||||
ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx1201"))
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -569,11 +595,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_TILE_ENGINE_BASIC_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx942") }
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script { loadCk(); ck.runTileEngineBasicTests(params.BUILD_COMPILER) }
|
||||
cleanWs()
|
||||
script {
|
||||
loadCk()
|
||||
ck.runOnHealthyNode(rocmnode("gfx942")) {
|
||||
deleteDir()
|
||||
ck.runTileEngineBasicTests(params.BUILD_COMPILER)
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -592,11 +623,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx942") }
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script { loadCk(); ck.runTileEngineGemmTests("gfx942", params.BUILD_COMPILER) }
|
||||
cleanWs()
|
||||
script {
|
||||
loadCk()
|
||||
ck.runOnHealthyNode(rocmnode("gfx942")) {
|
||||
deleteDir()
|
||||
ck.runTileEngineGemmTests("gfx942", params.BUILD_COMPILER)
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
stage("Run TILE_ENGINE_GEMM Tests on gfx950")
|
||||
@@ -605,11 +641,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx950") }
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script { loadCk(); ck.runTileEngineGemmTests("gfx950", params.BUILD_COMPILER) }
|
||||
cleanWs()
|
||||
script {
|
||||
loadCk()
|
||||
ck.runOnHealthyNode(rocmnode("gfx950")) {
|
||||
deleteDir()
|
||||
ck.runTileEngineGemmTests("gfx950", params.BUILD_COMPILER)
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
stage("Run TILE_ENGINE_GEMM Tests on gfx1201")
|
||||
@@ -618,11 +659,16 @@ pipeline {
|
||||
beforeAgent true
|
||||
expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx1201") }
|
||||
agent none
|
||||
steps{
|
||||
deleteDir()
|
||||
script { loadCk(); ck.runTileEngineGemmTests("gfx1201", params.BUILD_COMPILER) }
|
||||
cleanWs()
|
||||
script {
|
||||
loadCk()
|
||||
ck.runOnHealthyNode(rocmnode("gfx1201")) {
|
||||
deleteDir()
|
||||
ck.runTileEngineGemmTests("gfx1201", params.BUILD_COMPILER)
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -453,8 +453,12 @@ def devicesUp() {
|
||||
sh(returnStatus:true, script:'test -e /dev/kfd && ls /dev/dri/renderD* >/dev/null 2>&1') == 0
|
||||
}
|
||||
def cacheWritable() { sh(returnStatus:true, script:'D=${SCCACHE_DIR:-/.cache/sccache}; mkdir -p "$D/probe" 2>/dev/null') == 0 }
|
||||
def diskOk(String path='/var/jenkins/workspace', int minGb=5) {
|
||||
def diskOk(String path='/var/jenkins', int minGb=5) {
|
||||
echo "Preflight: checking disk space on ${path} (minimum ${minGb}GB)"
|
||||
if (sh(returnStatus:true, script:"test -d ${path}") != 0) {
|
||||
echo "Preflight: disk check path ${path} does not exist, skipping"
|
||||
return true
|
||||
}
|
||||
sh(returnStdout:true, script:"df --output=avail -BG ${path} | tail -1 | tr -dc '0-9'").trim().toInteger() >= minGb
|
||||
}
|
||||
|
||||
@@ -464,11 +468,13 @@ def gpuUsable(String image) { sh(returnStatus:true, script:"docker run --rm --de
|
||||
// Fail fast with a NodeFault if this agent is unfit to build. Host-only — no image
|
||||
// required. Image/registry/container faults are classified in the body by pullImage
|
||||
// and the in-container GPU check, where the correct conf is available.
|
||||
def preflight() {
|
||||
def preflight(boolean requireGpu) {
|
||||
echo "Preflight: starting node health checks on ${env.NODE_NAME}"
|
||||
if (!daemonUp()) throw new org.ck.NodeFault('docker-daemon-down')
|
||||
if (!driverUp()) throw new org.ck.NodeFault('driver-not-loaded')
|
||||
if (!devicesUp()) throw new org.ck.NodeFault('gpu-devices-missing')
|
||||
if (requireGpu) {
|
||||
if (!driverUp()) throw new org.ck.NodeFault('driver-not-loaded')
|
||||
if (!devicesUp()) throw new org.ck.NodeFault('gpu-devices-missing')
|
||||
}
|
||||
if (!diskOk()) throw new org.ck.NodeFault('disk-space-low')
|
||||
echo "Preflight: all checks passed on ${env.NODE_NAME}"
|
||||
// sccache cache-dir writability is not checked here: sccache runs inside
|
||||
@@ -543,7 +549,10 @@ def runOnHealthyNode(String label, Closure body) {
|
||||
node(exclude(label, excluded)) {
|
||||
attemptNode = env.NODE_NAME
|
||||
echo "Node attempt ${attempt + 1}/${nodeAttempts} on ${attemptNode}"
|
||||
preflight()
|
||||
// Derive GPU requirement from the node label: only "nogpu" stages
|
||||
// skip the driver/device checks. A new non-GPU label would need
|
||||
// adding here (otherwise preflight would wrongly demand a GPU).
|
||||
preflight(!label.contains('nogpu'))
|
||||
runInPlace(body, transientRetries)
|
||||
}
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user