diff --git a/groovy/vars/ck.groovy b/groovy/vars/ck.groovy index c58cd5f474..f828055544 100644 --- a/groovy/vars/ck.groovy +++ b/groovy/vars/ck.groovy @@ -50,6 +50,28 @@ def setGithubStatus(String context, String state, String description) { } } +// Retry a flaky git network operation a few times with backoff. Handles +// momentary DNS/connectivity blips (e.g. "Could not resolve host: github.com") +// that would otherwise fail the whole build. Wrap each network-touching git +// step (ref-repo clone/update, SCM checkout) so a transient blip retries +// instead of failing the build. If all attempts fail, the node likely can't +// reach github at all, so escalate to a NodeFault: runOnHealthyNode then +// excludes this node and reruns the stage on another one. +def gitNetRetry(String label, Closure body) { + int maxAttempts = 3 + for (int i = 1; i <= maxAttempts; i++) { + try { body(); return } + catch (e) { + if (i == maxAttempts) { + echo "${label} failed all ${maxAttempts} attempts on ${env.NODE_NAME}; treating as node fault to reroute to another node: ${e.message}" + throw new org.ck.NodeFault("${label}: ${e.message}") + } + echo "${label} failed (attempt ${i}/${maxAttempts}) on ${env.NODE_NAME}, retrying in 15s: ${e.message}" + sleep(time: 15, unit: 'SECONDS') + } + } +} + def cloneUpdateRefRepo() { def refRepoPath = "/var/jenkins/ref-repo/rocm-libraries" def lockLabel = "git ref repo lock - ${env.NODE_NAME}" @@ -67,7 +89,7 @@ def cloneUpdateRefRepo() { rm -rf ${refRepoPath} && mkdir -p ${refRepoPath} git clone --mirror https://github.com/ROCm/rocm-libraries.git ${refRepoPath} """ - sh(script: cloneCommand, label: "clone ref repo") + gitNetRetry("clone ref repo") { sh(script: cloneCommand, label: "clone ref repo") } } echo "Completed git clone, lock released" } @@ -80,7 +102,7 @@ def cloneUpdateRefRepo() { git remote prune origin git remote update """ - sh(script: fetchCommand, label: "update ref repo") + gitNetRetry("update ref repo") { sh(script: fetchCommand, label: "update ref repo") } } echo "Completed git ref repo fetch, lock released" } @@ -90,7 +112,7 @@ def checkoutComposableKernel() //update ref repo cloneUpdateRefRepo() // checkout project - def scmVars = checkout scm + gitNetRetry("checkout scm") { checkout scm } // getGitHubCommitHash reads SCMRevisionAction recorded before any local merge, // giving the true PR branch tip (pullHash) or branch HEAD (hash). // Falls back to ORIG_HEAD (pre-merge HEAD set by git merge) when SCMRevisionAction @@ -1011,9 +1033,13 @@ def buildAndTest(Map conf=[:]){ } if (params.hipTensor_test && arch == "gfx90a" ){ // build and test hipTensor on gfx90a node + gitNetRetry("checkout hipTensor") { + sh """#!/bin/bash + git sparse-checkout add projects/hiptensor + git checkout "${params.hipTensor_branch}" + """ + } sh """#!/bin/bash - git sparse-checkout add projects/hiptensor - git checkout "${params.hipTensor_branch}" cd projects/hiptensor && mkdir -p build && CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/projects/composablekernel/install" && cmake --build build -- -j &&