From 081fe18c1cf0e168921157cf8455e8a77d3ead1a Mon Sep 17 00:00:00 2001 From: Brock Hargreaves <253123018+brockhargreaves-amd@users.noreply.github.com> Date: Thu, 18 Jun 2026 21:18:27 +0000 Subject: [PATCH] [rocm-libraries] ROCm/rocm-libraries#8558 (commit ccfa08b) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [CK][CI] Retry git network ops to survive transient DNS blips (#8558) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Motivation CI builds intermittently fail on transient git DNS blips (e.g. `Could not resolve host: github.com`). These surface as an untyped `exit code 1`, which the existing node/transient-fault retry doesn't catch — so a momentary glitch fails the whole build. ## Technical Details Added `gitNetRetry(label, body)` (3 attempts, 15s backoff) and wrapped every github.com-touching git step: ref-repo clone/update, `checkout scm`, and the hipTensor clone. All are idempotent on retry. Docker pulls are left to the existing `pullImage()` path. ## Test Plan - Mapped the failing build's `git remote update` DNS error to a now-wrapped call. - Confirmed no existing code retries git host-resolution failures. ## Test Result Groovy shared-library — not locally executable; needs a pipeline run to fully validate. Check CI. ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --- groovy/vars/ck.groovy | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/groovy/vars/ck.groovy b/groovy/vars/ck.groovy index c58cd5f474..f828055544 100644 --- a/groovy/vars/ck.groovy +++ b/groovy/vars/ck.groovy @@ -50,6 +50,28 @@ def setGithubStatus(String context, String state, String description) { } } +// Retry a flaky git network operation a few times with backoff. Handles +// momentary DNS/connectivity blips (e.g. 
"Could not resolve host: github.com") +// that would otherwise fail the whole build. Wrap each network-touching git +// step (ref-repo clone/update, SCM checkout) so a transient blip retries +// instead of failing the build. If all attempts fail, the node likely can't +// reach github at all, so escalate to a NodeFault: runOnHealthyNode then +// excludes this node and reruns the stage on another one. +def gitNetRetry(String label, Closure body) { + int maxAttempts = 3 + for (int i = 1; i <= maxAttempts; i++) { + try { body(); return } + catch (e) { + if (i == maxAttempts) { + echo "${label} failed all ${maxAttempts} attempts on ${env.NODE_NAME}; treating as node fault to reroute to another node: ${e.message}" + throw new org.ck.NodeFault("${label}: ${e.message}") + } + echo "${label} failed (attempt ${i}/${maxAttempts}) on ${env.NODE_NAME}, retrying in 15s: ${e.message}" + sleep(time: 15, unit: 'SECONDS') + } + } +} + def cloneUpdateRefRepo() { def refRepoPath = "/var/jenkins/ref-repo/rocm-libraries" def lockLabel = "git ref repo lock - ${env.NODE_NAME}" @@ -67,7 +89,7 @@ def cloneUpdateRefRepo() { rm -rf ${refRepoPath} && mkdir -p ${refRepoPath} git clone --mirror https://github.com/ROCm/rocm-libraries.git ${refRepoPath} """ - sh(script: cloneCommand, label: "clone ref repo") + gitNetRetry("clone ref repo") { sh(script: cloneCommand, label: "clone ref repo") } } echo "Completed git clone, lock released" } @@ -80,7 +102,7 @@ def cloneUpdateRefRepo() { git remote prune origin git remote update """ - sh(script: fetchCommand, label: "update ref repo") + gitNetRetry("update ref repo") { sh(script: fetchCommand, label: "update ref repo") } } echo "Completed git ref repo fetch, lock released" } @@ -90,7 +112,7 @@ def checkoutComposableKernel() //update ref repo cloneUpdateRefRepo() // checkout project - def scmVars = checkout scm + gitNetRetry("checkout scm") { checkout scm } // getGitHubCommitHash reads SCMRevisionAction recorded before any local merge, // giving the 
true PR branch tip (pullHash) or branch HEAD (hash). // Falls back to ORIG_HEAD (pre-merge HEAD set by git merge) when SCMRevisionAction @@ -1011,9 +1033,13 @@ def buildAndTest(Map conf=[:]){ } if (params.hipTensor_test && arch == "gfx90a" ){ // build and test hipTensor on gfx90a node + gitNetRetry("checkout hipTensor") { + sh """#!/bin/bash + git sparse-checkout add projects/hiptensor + git checkout "${params.hipTensor_branch}" + """ + } sh """#!/bin/bash - git sparse-checkout add projects/hiptensor - git checkout "${params.hipTensor_branch}" cd projects/hiptensor && mkdir -p build && CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/projects/composablekernel/install" && cmake --build build -- -j &&