Reviewer notes (text before the first "diff --git" line is ignored by git apply / patch):
  * This patch was reflowed from a copy whose line breaks were collapsed. Hunk line
    counts were re-checked against every @@ header, but indentation and blank context
    lines are reconstructed and may not match the target files byte-for-byte.
    Try `git apply --ignore-whitespace`, or regenerate the patch from the branch.
  * Review fix: preflight() now declares `boolean requireGpu = true`, so any existing
    zero-argument preflight() caller keeps the previous behaviour (driver and device
    checks enabled) instead of failing on a missing argument.
  * Not changed here (would alter hunk sizes): cleanWs() inside each runOnHealthyNode
    closure is skipped when the body throws; consider try/finally in a follow-up.

diff --git a/Jenkinsfile b/Jenkinsfile
index 6915c73b0c..4b49313446 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -270,13 +270,15 @@ pipeline {
     }
     stages{
         stage("Determine CI Execution") {
-            agent{ label rocmnode("nogpu") }
+            agent none
             steps {
                 script {
                     loadCk()
-                    ck.checkoutComposableKernel()
-                    env.SHOULD_RUN_CI = String.valueOf(params.FORCE_CI.toBoolean() || ck.shouldRunCICheck())
-                    echo "SHOULD_RUN_CI: ${env.SHOULD_RUN_CI}"
+                    ck.runOnHealthyNode(rocmnode("nogpu")) {
+                        ck.checkoutComposableKernel()
+                        env.SHOULD_RUN_CI = String.valueOf(params.FORCE_CI.toBoolean() || ck.shouldRunCICheck())
+                        echo "SHOULD_RUN_CI: ${env.SHOULD_RUN_CI}"
+                    }
                 }
             }
         }
@@ -287,14 +289,16 @@ pipeline {
             }
             parallel{
                 stage('Docker /opt/rocm'){
-                    agent{ label rocmnode("nogpu") }
+                    agent none
                     steps{
-                        deleteDir()
                         script {
                             loadCk()
-                            ck.buildDocker('/opt/rocm')
+                            ck.runOnHealthyNode(rocmnode("nogpu")) {
+                                deleteDir()
+                                ck.buildDocker('/opt/rocm')
+                                cleanWs()
+                            }
                         }
-                        cleanWs()
                     }
                 }
             }
@@ -304,11 +308,16 @@ pipeline {
                 beforeAgent true
                 expression { env.SHOULD_RUN_CI.toBoolean() }
             }
-            agent{ label rocmnode("nogpu") }
+            agent none
             steps{
-                deleteDir()
-                script { loadCk(); ck.runStaticChecks() }
-                cleanWs()
+                script {
+                    loadCk()
+                    ck.runOnHealthyNode(rocmnode("nogpu")) {
+                        deleteDir()
+                        ck.runStaticChecks()
+                        cleanWs()
+                    }
+                }
             }
         }
         stage("Run Downstream Tests")
@@ -325,13 +334,15 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_PYTORCH_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx942")}
+                    agent none
                     steps{
                         script {
                             loadCk()
-                            ck.run_downstream_tests(image: "${env.CK_PYTORCH_IMAGE}", timeoutHours: 2, execute_cmds: ck.getPytorchTestsCmds())
+                            ck.runOnHealthyNode(rocmnode("gfx942")) {
+                                ck.run_downstream_tests(image: "${env.CK_PYTORCH_IMAGE}", timeoutHours: 2, execute_cmds: ck.getPytorchTestsCmds())
+                                cleanWs()
+                            }
                         }
-                        cleanWs()
                     }
                 }
                 stage("Run AITER Tests on gfx942")
@@ -340,13 +351,15 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_AITER_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx942")}
+                    agent none
                     steps{
                         script {
                             loadCk()
-                            ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
+                            ck.runOnHealthyNode(rocmnode("gfx942")) {
+                                ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
+                                cleanWs()
+                            }
                         }
-                        cleanWs()
                     }
                 }
                 stage("Run AITER Tests on gfx950")
@@ -355,13 +368,15 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_AITER_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx950")}
+                    agent none
                     steps{
                         script {
                             loadCk()
-                            ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
+                            ck.runOnHealthyNode(rocmnode("gfx950")) {
+                                ck.run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: ck.getAiterTestsCmds())
+                                cleanWs()
+                            }
                         }
-                        cleanWs()
                     }
                 }
                 stage("Run FA Tests on gfx942")
@@ -370,13 +385,15 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_FA_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx942")}
+                    agent none
                     steps{
                         script {
                             loadCk()
-                            ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
+                            ck.runOnHealthyNode(rocmnode("gfx942")) {
+                                ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
+                                cleanWs()
+                            }
                         }
-                        cleanWs()
                     }
                 }
                 stage("Run FA Tests on gfx950")
@@ -385,13 +402,15 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_FA_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx950")}
+                    agent none
                     steps{
                         script {
                             loadCk()
-                            ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
+                            ck.runOnHealthyNode(rocmnode("gfx950")) {
+                                ck.run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: ck.getFaTestsCmds())
+                                cleanWs()
+                            }
                         }
-                        cleanWs()
                     }
                 }
             }
@@ -410,11 +429,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_FULL_CONV_TILE_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx90a")}
+                    agent none
                     steps{
-                        deleteDir()
-                        script { loadCk(); ck.runFullGroupedConvTileTests() }
-                        cleanWs()
+                        script {
+                            loadCk()
+                            ck.runOnHealthyNode(rocmnode("gfx90a")) {
+                                deleteDir()
+                                ck.runFullGroupedConvTileTests()
+                                cleanWs()
+                            }
+                        }
                     }
                 }
             }
@@ -433,11 +457,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_GROUPED_CONV_LARGE_CASES_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx90a")}
+                    agent none
                     steps{
-                        deleteDir()
-                        script { loadCk(); ck.runGroupedConvLargeCaseTests() }
-                        cleanWs()
+                        script {
+                            loadCk()
+                            ck.runOnHealthyNode(rocmnode("gfx90a")) {
+                                deleteDir()
+                                ck.runGroupedConvLargeCaseTests()
+                                cleanWs()
+                            }
+                        }
                     }
                 }
             }
@@ -456,11 +485,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_CONV_COMPREHENSIVE_DATASET.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx90a")}
+                    agent none
                     steps{
-                        deleteDir()
-                        script { loadCk(); ck.runComprehensiveConvDatasetTests() }
-                        cleanWs()
+                        script {
+                            loadCk()
+                            ck.runOnHealthyNode(rocmnode("gfx90a")) {
+                                deleteDir()
+                                ck.runComprehensiveConvDatasetTests()
+                                cleanWs()
+                            }
+                        }
                     }
                 }
             }
@@ -479,18 +513,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx90a") }
-                    environment{
-                        setup_args = "NO_CK_BUILD"
-                        execute_args = ck.build_and_run_fmha("gfx90a")
-                    }
+                    agent none
                     steps{
-                        deleteDir()
                         script {
                             loadCk()
-                            ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
+                            ck.runOnHealthyNode(rocmnode("gfx90a")) {
+                                deleteDir()
+                                ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx90a"))
+                                cleanWs()
+                            }
                         }
-                        cleanWs()
                     }
                 }
                 stage("Run CK_TILE_FMHA Tests on gfx942")
@@ -499,18 +531,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx942") }
-                    environment{
-                        setup_args = "NO_CK_BUILD"
-                        execute_args = ck.build_and_run_fmha("gfx942")
-                    }
+                    agent none
                     steps{
-                        deleteDir()
                         script {
                             loadCk()
-                            ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
+                            ck.runOnHealthyNode(rocmnode("gfx942")) {
+                                deleteDir()
+                                ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx942"))
+                                cleanWs()
+                            }
                         }
-                        cleanWs()
                     }
                 }
                 stage("Run CK_TILE_FMHA Tests on gfx950")
@@ -519,18 +549,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx950") }
-                    environment{
-                        setup_args = "NO_CK_BUILD"
-                        execute_args = ck.build_and_run_fmha("gfx950")
-                    }
+                    agent none
                     steps{
-                        deleteDir()
                         script {
                             loadCk()
-                            ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
+                            ck.runOnHealthyNode(rocmnode("gfx950")) {
+                                deleteDir()
+                                ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx950"))
+                                cleanWs()
+                            }
                         }
-                        cleanWs()
                     }
                 }
                 stage("Run CK_TILE_FMHA Tests on gfx1201")
@@ -539,18 +567,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx1201") }
-                    environment{
-                        setup_args = "NO_CK_BUILD"
-                        execute_args = ck.build_and_run_fmha("gfx1201")
-                    }
+                    agent none
                     steps{
-                        deleteDir()
                         script {
                             loadCk()
-                            ck.buildAndTest(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
+                            ck.runOnHealthyNode(rocmnode("gfx1201")) {
+                                deleteDir()
+                                ck.buildAndTest(setup_args: "NO_CK_BUILD", build_type: 'Release', execute_cmd: ck.build_and_run_fmha("gfx1201"))
+                                cleanWs()
+                            }
                         }
-                        cleanWs()
                     }
                 }
             }
@@ -569,11 +595,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_TILE_ENGINE_BASIC_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx942") }
+                    agent none
                     steps{
-                        deleteDir()
-                        script { loadCk(); ck.runTileEngineBasicTests(params.BUILD_COMPILER) }
-                        cleanWs()
+                        script {
+                            loadCk()
+                            ck.runOnHealthyNode(rocmnode("gfx942")) {
+                                deleteDir()
+                                ck.runTileEngineBasicTests(params.BUILD_COMPILER)
+                                cleanWs()
+                            }
+                        }
                     }
                 }
             }
@@ -592,11 +623,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx942") }
+                    agent none
                     steps{
-                        deleteDir()
-                        script { loadCk(); ck.runTileEngineGemmTests("gfx942", params.BUILD_COMPILER) }
-                        cleanWs()
+                        script {
+                            loadCk()
+                            ck.runOnHealthyNode(rocmnode("gfx942")) {
+                                deleteDir()
+                                ck.runTileEngineGemmTests("gfx942", params.BUILD_COMPILER)
+                                cleanWs()
+                            }
+                        }
                     }
                 }
                 stage("Run TILE_ENGINE_GEMM Tests on gfx950")
@@ -605,11 +641,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx950") }
+                    agent none
                     steps{
-                        deleteDir()
-                        script { loadCk(); ck.runTileEngineGemmTests("gfx950", params.BUILD_COMPILER) }
-                        cleanWs()
+                        script {
+                            loadCk()
+                            ck.runOnHealthyNode(rocmnode("gfx950")) {
+                                deleteDir()
+                                ck.runTileEngineGemmTests("gfx950", params.BUILD_COMPILER)
+                                cleanWs()
+                            }
+                        }
                     }
                 }
                 stage("Run TILE_ENGINE_GEMM Tests on gfx1201")
@@ -618,11 +659,16 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx1201") }
+                    agent none
                     steps{
-                        deleteDir()
-                        script { loadCk(); ck.runTileEngineGemmTests("gfx1201", params.BUILD_COMPILER) }
-                        cleanWs()
+                        script {
+                            loadCk()
+                            ck.runOnHealthyNode(rocmnode("gfx1201")) {
+                                deleteDir()
+                                ck.runTileEngineGemmTests("gfx1201", params.BUILD_COMPILER)
+                                cleanWs()
+                            }
+                        }
                     }
                 }
             }
diff --git a/groovy/vars/ck.groovy b/groovy/vars/ck.groovy
index ef1a82ee76..c21e8ab3a6 100644
--- a/groovy/vars/ck.groovy
+++ b/groovy/vars/ck.groovy
@@ -453,8 +453,12 @@
 def devicesUp() { sh(returnStatus:true, script:'test -e /dev/kfd && ls /dev/dri/renderD* >/dev/null 2>&1') == 0 }
 def cacheWritable() { sh(returnStatus:true, script:'D=${SCCACHE_DIR:-/.cache/sccache}; mkdir -p "$D/probe" 2>/dev/null') == 0 }
 
-def diskOk(String path='/var/jenkins/workspace', int minGb=5) {
+def diskOk(String path='/var/jenkins', int minGb=5) {
     echo "Preflight: checking disk space on ${path} (minimum ${minGb}GB)"
+    if (sh(returnStatus:true, script:"test -d ${path}") != 0) {
+        echo "Preflight: disk check path ${path} does not exist, skipping"
+        return true
+    }
     sh(returnStdout:true, script:"df --output=avail -BG ${path} | tail -1 | tr -dc '0-9'").trim().toInteger() >= minGb
 }
 
@@ -464,11 +468,13 @@ def gpuUsable(String image) { sh(returnStatus:true, script:"docker run --rm --de
 // Fail fast with a NodeFault if this agent is unfit to build. Host-only — no image
 // required. Image/registry/container faults are classified in the body by pullImage
 // and the in-container GPU check, where the correct conf is available.
-def preflight() {
+def preflight(boolean requireGpu = true) {
     echo "Preflight: starting node health checks on ${env.NODE_NAME}"
     if (!daemonUp()) throw new org.ck.NodeFault('docker-daemon-down')
-    if (!driverUp()) throw new org.ck.NodeFault('driver-not-loaded')
-    if (!devicesUp()) throw new org.ck.NodeFault('gpu-devices-missing')
+    if (requireGpu) {
+        if (!driverUp()) throw new org.ck.NodeFault('driver-not-loaded')
+        if (!devicesUp()) throw new org.ck.NodeFault('gpu-devices-missing')
+    }
     if (!diskOk()) throw new org.ck.NodeFault('disk-space-low')
     echo "Preflight: all checks passed on ${env.NODE_NAME}"
     // sccache cache-dir writability is not checked here: sccache runs inside
@@ -543,7 +549,10 @@ def runOnHealthyNode(String label, Closure body) {
             node(exclude(label, excluded)) {
                 attemptNode = env.NODE_NAME
                 echo "Node attempt ${attempt + 1}/${nodeAttempts} on ${attemptNode}"
-                preflight()
+                // Derive GPU requirement from the node label: only "nogpu" stages
+                // skip the driver/device checks. A new non-GPU label would need
+                // adding here (otherwise preflight would wrongly demand a GPU).
+                preflight(!label.contains('nogpu'))
                 runInPlace(body, transientRetries)
             }
             return