From 3fa9d889d5ab5ceebdfdddee15c27b5d4a7a3278 Mon Sep 17 00:00:00 2001 From: andrew clark Date: Mon, 23 Mar 2026 14:56:25 -0600 Subject: [PATCH] Adding New Notification Detection (#5713) ## Motivation Restricting one of the notification failure patterns to match a specific missing-driver log pattern. This will help reduce the noise of erroneous logs. Also adding a new failure pattern to notify us of GitHub access issues. ## Technical Details - Set the failure pattern to match the exact failure observed in the logs. - Switching to a plain substring search so special characters are handled literally. - Added a new failure pattern for GitHub access errors. ## Test Plan - Force a failure using the known failure patterns. ## Test Result The forced failures were triggered and caught by the notification system. ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --- script/infra_helper/send_failure_notifications.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/script/infra_helper/send_failure_notifications.sh b/script/infra_helper/send_failure_notifications.sh index 11a3bb4f7d..70488bf4ae 100644 --- a/script/infra_helper/send_failure_notifications.sh +++ b/script/infra_helper/send_failure_notifications.sh @@ -22,12 +22,13 @@ PATTERNS=( 'login attempt to .* failed with status: 401 Unauthorized' 'docker login failed' 'HTTP request sent .* 404 Not Found' - 'cat: .* No such file or directory' + '/sys/module/amdgpu/version: No such file or directory' 'GPU not found' 'Could not connect to Redis at .* Connection timed out' 'unauthorized: your account must log in with a Personal Access Token' 'sccache: error: Server startup failed: Address in use' 'No space left on device' + 'Could not resolve host: github.com' ) DESCRIPTIONS=( @@ -40,10 +41,11 @@ DESCRIPTIONS=( "Docker login failed" "Sccache Error" "Device space error" + "Unable to access Github" ) # Indices
into PATTERNS/DESCRIPTIONS for which a node name lookup is performed. -NODE_PATTERN_INDICES=(3 4 8) # cat: No such file, GPU not found, No space left on device +NODE_PATTERN_INDICES=(3 4 8 9) # --------------------------------------------------------------------------- # Fetch and scan the log. @@ -92,7 +94,7 @@ process_block() { if [[ "$node_idx" == "$i" ]]; then node_name=$(wget -q --no-check-certificate -O - "${BUILD_URL}consoleText" | awk ' /NODE_NAME[[:space:]]*=/ { node = $NF } - /'"$pattern"'/ { print node; exit } + index($0, "'"$pattern"'") { print node; exit } ') break fi