mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-01 20:27:42 +00:00
[rocm-libraries] ROCm/rocm-libraries#8558 (commit ccfa08b)
[CK][CI] Retry git network ops to survive transient DNS blips (#8558) ## Motivation CI builds intermittently fail on transient git DNS blips (e.g. `Could not resolve host: github.com`). These surface as an untyped `exit code 1`, which the existing node/transient-fault retry doesn't catch — so a momentary glitch fails the whole build. ## Technical Details Added `gitNetRetry(label, body)` (3 attempts, 15s backoff) and wrapped every github.com-touching git step: ref-repo clone/update, `checkout scm`, and the hipTensor clone. All are idempotent on retry. Docker pulls are left to the existing `pullImage()` path. ## Test Plan - Mapped the failing build's `git remote update` DNS error to a now-wrapped call. - Confirmed no existing code retries git host-resolution failures. ## Test Result Groovy shared-library — not locally executable; needs a pipeline run to fully validate. Check CI. ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
This commit is contained in:
committed by
assistant-librarian[bot]
parent
8864dcc3a4
commit
081fe18c1c
@@ -50,6 +50,28 @@ def setGithubStatus(String context, String state, String description) {
|
||||
}
|
||||
}
|
||||
|
||||
// Retry a flaky git network operation a few times with backoff. Handles
// momentary DNS/connectivity blips (e.g. "Could not resolve host: github.com")
// that would otherwise fail the whole build. Wrap each network-touching git
// step (ref-repo clone/update, SCM checkout) so a transient blip retries
// instead of failing the build. If all attempts fail, the node likely can't
// reach github at all, so escalate to a NodeFault: runOnHealthyNode then
// excludes this node and reruns the stage on another one.
//
// Parameters:
//   label - short name of the operation, used as a prefix in log lines and in
//           the NodeFault message.
//   body  - closure that performs the git operation; it is re-invoked as-is on
//           every attempt, so it must be safe to run more than once.
// Returns:
//   whatever `body` returns on the first successful attempt (e.g. the map
//   produced by `checkout scm`), so wrapping a step does not lose its result.
// Throws:
//   org.ck.NodeFault once all attempts have failed;
//   FlowInterruptedException immediately, without retrying, when the build is
//   aborted or an enclosing timeout fires.
def gitNetRetry(String label, Closure body) {
    int maxAttempts = 3
    for (int i = 1; i <= maxAttempts; i++) {
        try {
            // Propagate the closure's result instead of discarding it.
            return body()
        }
        catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException interrupted) {
            // A user abort / timeout is not a network blip: never retry it and
            // never relabel it as a node fault.
            throw interrupted
        }
        catch (e) {
            if (i == maxAttempts) {
                echo "${label} failed all ${maxAttempts} attempts on ${env.NODE_NAME}; treating as node fault to reroute to another node: ${e.message}"
                throw new org.ck.NodeFault("${label}: ${e.message}")
            }
            echo "${label} failed (attempt ${i}/${maxAttempts}) on ${env.NODE_NAME}, retrying in 15s: ${e.message}"
            sleep(time: 15, unit: 'SECONDS')
        }
    }
}
|
||||
|
||||
def cloneUpdateRefRepo() {
|
||||
def refRepoPath = "/var/jenkins/ref-repo/rocm-libraries"
|
||||
def lockLabel = "git ref repo lock - ${env.NODE_NAME}"
|
||||
@@ -67,7 +89,7 @@ def cloneUpdateRefRepo() {
|
||||
rm -rf ${refRepoPath} && mkdir -p ${refRepoPath}
|
||||
git clone --mirror https://github.com/ROCm/rocm-libraries.git ${refRepoPath}
|
||||
"""
|
||||
sh(script: cloneCommand, label: "clone ref repo")
|
||||
gitNetRetry("clone ref repo") { sh(script: cloneCommand, label: "clone ref repo") }
|
||||
}
|
||||
echo "Completed git clone, lock released"
|
||||
}
|
||||
@@ -80,7 +102,7 @@ def cloneUpdateRefRepo() {
|
||||
git remote prune origin
|
||||
git remote update
|
||||
"""
|
||||
sh(script: fetchCommand, label: "update ref repo")
|
||||
gitNetRetry("update ref repo") { sh(script: fetchCommand, label: "update ref repo") }
|
||||
}
|
||||
echo "Completed git ref repo fetch, lock released"
|
||||
}
|
||||
@@ -90,7 +112,7 @@ def checkoutComposableKernel()
|
||||
//update ref repo
|
||||
cloneUpdateRefRepo()
|
||||
// checkout project
|
||||
def scmVars = checkout scm
|
||||
gitNetRetry("checkout scm") { checkout scm }
|
||||
// getGitHubCommitHash reads SCMRevisionAction recorded before any local merge,
|
||||
// giving the true PR branch tip (pullHash) or branch HEAD (hash).
|
||||
// Falls back to ORIG_HEAD (pre-merge HEAD set by git merge) when SCMRevisionAction
|
||||
@@ -1011,9 +1033,13 @@ def buildAndTest(Map conf=[:]){
|
||||
}
|
||||
if (params.hipTensor_test && arch == "gfx90a" ){
|
||||
// build and test hipTensor on gfx90a node
|
||||
gitNetRetry("checkout hipTensor") {
|
||||
sh """#!/bin/bash
|
||||
git sparse-checkout add projects/hiptensor
|
||||
git checkout "${params.hipTensor_branch}"
|
||||
"""
|
||||
}
|
||||
sh """#!/bin/bash
|
||||
git sparse-checkout add projects/hiptensor
|
||||
git checkout "${params.hipTensor_branch}"
|
||||
cd projects/hiptensor && mkdir -p build &&
|
||||
CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/projects/composablekernel/install" &&
|
||||
cmake --build build -- -j &&
|
||||
|
||||
Reference in New Issue
Block a user