diff --git a/Jenkinsfile b/Jenkinsfile
index 8c1276826c..163cbcb690 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -113,71 +113,6 @@ def checkoutComposableKernel()
     checkout scm
 }
 
-// Given a pattern, check if the log contains the pattern and return the context.
-def checkForPattern(pattern, log) {
-    def lines = log.split('\n')
-    for (int i = 0; i < lines.size(); i++) {
-        if (lines[i] =~ pattern) {
-            echo "Found pattern match in log for ${pattern}"
-
-            // Get the two lines before and after failure.
-            def contextStart = Math.max(0, i - 2)
-            def contextEnd = Math.min(lines.size() - 1, i + 2)
-            def contextLines = []
-            for (int j = contextStart; j <= contextEnd; j++) {
-                contextLines.add(lines[j])
-            }
-
-            return [found: true, matchedLine: lines[i], context: contextLines.join('\n')]
-        }
-    }
-    echo "No pattern match found in log for ${pattern}"
-    return [found: false, matchedLine: "", context: ""]
-}
-
-// Scan the build logs for failures and send notifications.
-def sendFailureNotifications() {
-    // Error patterns to scan build logs for specific failure types and send detailed notifications.
-    def failurePatterns = [
-        [pattern: /login attempt to .* failed with status: 401 Unauthorized/, description: "Docker registry authentication failed"],
-        [pattern: /.*docker login failed.*/, description: "Docker login failed"],
-        [pattern: /HTTP request sent .* 404 Not Found/, description: "HTTP request failed with 404"],
-        [pattern: /cat: .* No such file or directory/, description: "GPU not found"],
-        [pattern: /.*GPU not found.*/, description: "GPU not found"],
-        [pattern: /Could not connect to Redis at .* Connection timed out/, description: "Redis connection timed out"],
-        [pattern: /.*unauthorized: your account must log in with a Personal Access Token.*/, description: "Docker login failed"],
-        [pattern: /.*sccache: error: Server startup failed: Address in use.*/, description: "Sccache Error"]
-    ]
-
-    // Get the build log.
-    def buildLog = sh(script: 'wget -q --no-check-certificate -O - ' + BUILD_URL + 'consoleText', returnStdout: true)
-    echo "Checking for failure patterns..."
-    // Check for patterns in the log.
-    // def foundPatterns = []
-    // for (patternMap in failurePatterns) {
-    //     def result = checkForPattern(patternMap.pattern, buildLog)
-    //     if (result.found) {
-    //         foundPatterns.add([
-    //             description: patternMap.description,
-    //             matchedLine: result.matchedLine,
-    //             context: result.context
-    //         ])
-    //     }
-    // }
-    echo "Done checking for failure patterns..."
-    // Send a notification for each matched failure pattern.
-    for (patternMap in foundPatterns) {
-        withCredentials([string(credentialsId: 'ck_ci_errors_webhook_url', variable: 'WEBHOOK_URL')]) {
-            sh '''
-                curl -X POST "${WEBHOOK_URL}" \
-                    -H 'Content-Type: application/json' \
-                    -d '{"text": "\\n\\n**Build Failed**\\n\\n**Issues detected:** ''' + patternMap.description + '''\\n\\n**Log context:**\\n```\\n''' + patternMap.context.replace("'", "\\'") + '''\\n```\\n\\n**Job:** ''' + env.JOB_NAME + '''\\n\\n**Build:** #''' + env.BUILD_NUMBER + '''\\n\\n**URL:** ''' + env.RUN_DISPLAY_URL + '''"}'
-            '''
-        }
-    }
-    echo "Done failure pattern checking and notifications"
-}
-
 def generateAndArchiveBuildTraceVisualization(String buildTraceFileName) {
     try {
         checkoutComposableKernel()
@@ -2141,7 +2076,10 @@ pipeline {
                         description: 'Some checks have failed'
             node(rocmnode("nogpu")) {
                 script {
-                    sendFailureNotifications()
+                    checkoutComposableKernel()
+                }
+                withCredentials([string(credentialsId: 'ck_ci_errors_webhook_url', variable: 'WEBHOOK_URL')]) {
+                    sh 'bash projects/composablekernel/script/infra_helper/send_failure_notifications.sh'
                 }
             }
         }
diff --git a/script/infra_helper/send_failure_notifications.sh b/script/infra_helper/send_failure_notifications.sh
new file mode 100644
index 0000000000..11a3bb4f7d
--- /dev/null
+++ b/script/infra_helper/send_failure_notifications.sh
@@ -0,0 +1,186 @@
+#!/usr/bin/env bash
+# send_failure_notifications.sh
+#
+# Scans the Jenkins build log for known infrastructure failure patterns and
+# sends a Teams webhook notification for each match.
+#
+# Required environment variables (Jenkins provides all except WEBHOOK_URL):
+#   BUILD_URL        - Jenkins build URL (e.g. http://host/job/foo/42/)
+#   JOB_NAME         - Jenkins job name
+#   BUILD_NUMBER     - Jenkins build number
+#   RUN_DISPLAY_URL  - Jenkins Blue Ocean display URL
+#   WEBHOOK_URL      - Teams incoming webhook URL (passed via withCredentials)
+#
+# Exit status: 0 when the scan ran to completion (with or without matches),
+# non-zero when BUILD_URL/WEBHOOK_URL is missing or the log download fails.
+
+# Do not echo commands — the grep command contains all pattern strings and
+# would self-match if it appeared in the console log.
+set +x
+
+# Abort with a clear message rather than scanning an empty log or posting to
+# an empty URL when a mandatory variable is missing.
+: "${BUILD_URL:?BUILD_URL must be set}"
+: "${WEBHOOK_URL:?WEBHOOK_URL must be set}"
+
+# ---------------------------------------------------------------------------
+# Failure patterns and their descriptions (parallel indexed arrays).
+# ---------------------------------------------------------------------------
+PATTERNS=(
+    'login attempt to .* failed with status: 401 Unauthorized'
+    'docker login failed'
+    'HTTP request sent .* 404 Not Found'
+    'cat: .* No such file or directory'
+    'GPU not found'
+    'Could not connect to Redis at .* Connection timed out'
+    'unauthorized: your account must log in with a Personal Access Token'
+    'sccache: error: Server startup failed: Address in use'
+    'No space left on device'
+)
+
+DESCRIPTIONS=(
+    "Docker registry authentication failed"
+    "Docker login failed"
+    "HTTP request failed with 404"
+    "Missing drivers"
+    "GPU not found"
+    "Redis connection timed out"
+    "Docker login failed"
+    "Sccache Error"
+    "Device space error"
+)
+
+# Indices into PATTERNS/DESCRIPTIONS for which a node name lookup is performed.
+NODE_PATTERN_INDICES=(3 4 8)  # cat: No such file, GPU not found, No space left on device
+
+# ---------------------------------------------------------------------------
+# Helpers.
+# ---------------------------------------------------------------------------
+
+# json_escape STRING
+# Print STRING escaped for use inside a JSON string literal. Control characters
+# that JSON does not allow unescaped are dropped (tab, CR and LF are kept and
+# escaped), trailing newlines are trimmed, and backslashes and double quotes
+# are backslash-escaped.
+json_escape() {
+    local s
+    # Command substitution trims trailing newlines as a side effect.
+    s=$(printf '%s' "$1" | tr -d '\000-\010\013\014\016-\037')
+    s=${s//\\/\\\\}   # backslashes first, so later escapes are not doubled
+    s=${s//\"/\\\"}
+    s=${s//$'\t'/\\t}
+    s=${s//$'\r'/\\r}
+    s=${s//$'\n'/\\n}
+    printf '%s' "$s"
+}
+
+# ---------------------------------------------------------------------------
+# Fetch and scan the log.
+# ---------------------------------------------------------------------------
+# The console log is downloaded once into a private temporary directory and
+# reused for the node name lookup, so every step works on the same snapshot.
+WORK_DIR=$(mktemp -d) || exit 1
+trap 'rm -rf "$WORK_DIR"' EXIT
+LOG_FILE="$WORK_DIR/console.log"
+PAYLOAD_FILE="$WORK_DIR/webhook_payload.json"
+
+COMBINED_PATTERN=$(IFS='|'; printf '%s' "${PATTERNS[*]}")
+
+echo "Checking for failure patterns..."
+if ! wget -q --no-check-certificate -O "$LOG_FILE" "${BUILD_URL}consoleText"; then
+    echo "Unable to download the build log from ${BUILD_URL}consoleText" >&2
+    exit 1
+fi
+
+# -a: treat the log as text even if it contains bytes grep considers binary.
+GREP_OUTPUT=$(grep -a -E -B 2 -A 2 -- "${COMBINED_PATTERN}" "$LOG_FILE" || true)
+
+if [[ -z "$GREP_OUTPUT" ]]; then
+    echo "No failure patterns found in build log"
+    exit 0
+fi
+
+# ---------------------------------------------------------------------------
+# Process each grep context block.
+# ---------------------------------------------------------------------------
+# Track descriptions already notified to avoid duplicate notifications.
+declare -a NOTIFIED_DESCRIPTIONS=()
+
+# process_block BLOCK
+# Send one notification for every known pattern found in BLOCK (a grep context
+# group) whose description has not been notified yet.
+process_block() {
+    local block="$1"
+    [[ -z "$block" ]] && return
+
+    for i in "${!PATTERNS[@]}"; do
+        local pattern="${PATTERNS[$i]}"
+        local description="${DESCRIPTIONS[$i]}"
+
+        # Skip if this description was already notified.
+        local already_notified=false
+        for notified in "${NOTIFIED_DESCRIPTIONS[@]:-}"; do
+            [[ "$notified" == "$description" ]] && already_notified=true && break
+        done
+        $already_notified && continue
+
+        # Check if this block contains the pattern.
+        if printf '%s' "$block" | grep -a -qE -- "$pattern"; then
+            NOTIFIED_DESCRIPTIONS+=("$description")
+
+            # For node-related patterns, find the most recent NODE_NAME before
+            # the first line matching the pattern, using a single forward awk
+            # pass over the saved log that exits on that line. The pattern is
+            # passed through the environment so that "/" or quotes in it cannot
+            # break the awk program text.
+            local node_name=""
+            for node_idx in "${NODE_PATTERN_INDICES[@]}"; do
+                if [[ "$node_idx" == "$i" ]]; then
+                    node_name=$(FAILURE_PATTERN="$pattern" awk '
+                        /NODE_NAME[[:space:]]*=/        { node = $NF }
+                        $0 ~ ENVIRON["FAILURE_PATTERN"] { print node; exit }
+                    ' "$LOG_FILE")
+                    break
+                fi
+            done
+
+            # Build JSON payload and send notification. Every value goes
+            # through json_escape so log text cannot produce invalid JSON.
+            echo "Sending notification for: $description"
+            {
+                printf '{\n'
+                printf '  "jobName": "%s",\n' "$(json_escape "$JOB_NAME")"
+                printf '  "buildNumber": "%s",\n' "$(json_escape "$BUILD_NUMBER")"
+                printf '  "jobUrl": "%s",\n' "$(json_escape "$RUN_DISPLAY_URL")"
+                printf '  "detectedIssue": "%s",\n' "$(json_escape "$description")"
+                printf '  "logContext": "%s",\n' "$(json_escape "$block")"
+                printf '  "nodeName": "%s"\n' "$(json_escape "$node_name")"
+                printf '}\n'
+            } > "$PAYLOAD_FILE"
+
+            # --fail makes HTTP errors visible; a failed post is reported but
+            # does not stop the remaining notifications.
+            if ! curl -sS --fail -X POST "$WEBHOOK_URL" \
+                    -H "Content-Type: application/json" \
+                    -d @"$PAYLOAD_FILE"; then
+                echo "Warning: webhook request failed for: $description" >&2
+            fi
+        fi
+    done
+}
+
+# grep separates non-adjacent match groups with a line containing just "--".
+# Read line by line, accumulate into a block, and process when the separator
+# is hit. The final block has no trailing "--" so it is processed after the loop.
+current_block=""
+while IFS= read -r line; do
+    if [[ "$line" == "--" ]]; then
+        process_block "$current_block"
+        current_block=""
+    else
+        current_block+="$line"$'\n'
+    fi
+done <<< "$GREP_OUTPUT"
+process_block "$current_block"
+
+echo "Done failure pattern checking and notifications"