Adding sscache stats monitoring (#3428)

* Adding additional sccache and redis logging to each build

* Removing custom workspace

* Removing script reference

* Logging complete sccache stats

* Ensuring monitor is stopped if build fails

* Including additional sccache logging

* Removing build duration log

* Fixing groovy syntax error

* Fixing syntax

* Modifying logging statements

* Fixing syntax

* Modifying logging

* Modifying logging

* Including additional logging

* Fixing logging message

* Logging build path

* Testing

* Testing workspace path logs

* Adding additonal logging to monitor

* Modifying comments

* Adding copyright info

* Cleaning unnecessary logs

* Removing build time logs

* Merge branch 'develop' into aick-457

[ROCm/composable_kernel commit: e67cd7edeb]
This commit is contained in:
andrew clark
2025-12-17 09:15:27 -07:00
committed by GitHub
parent 509bcf1b2a
commit 23d11e9792
2 changed files with 158 additions and 2 deletions

41
Jenkinsfile vendored
View File

@@ -622,8 +622,45 @@ def cmake_build(Map conf=[:]){
echo cmd
dir("build"){
//build CK
sh cmd
// Start sccache monitoring
if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
sh """
chmod +x ../script/monitor_sccache_during_build.sh
mkdir -p logs
export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
../script/monitor_sccache_during_build.sh build_monitor &
MONITOR_PID=\$!
echo "Monitor PID: \$MONITOR_PID"
echo \$MONITOR_PID > monitor.pid
"""
}
try {
//build CK
sh cmd
} catch (Exception buildError) {
echo "Build failed: ${buildError.getMessage()}"
throw buildError
} finally {
// Stop sccache monitoring
if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
sh """
# Stop monitoring
if [ -f monitor.pid ]; then
MONITOR_PID=\$(cat monitor.pid)
kill \$MONITOR_PID 2>/dev/null || echo "Monitor already stopped"
rm -f monitor.pid
fi
"""
// Archive the monitoring logs
try {
archiveArtifacts artifacts: "logs/*monitor*.log", allowEmptyArchive: true
} catch (Exception e) {
echo "Could not archive sccache monitoring logs: ${e.getMessage()}"
}
}
}
//run tests except when NO_CK_BUILD or BUILD_LEGACY_OS are set
if(!setup_args.contains("NO_CK_BUILD") && !params.BUILD_LEGACY_OS){
sh "python3 ../script/ninja_json_converter.py .ninja_log --legacy-format --output ck_build_trace_${check_arch_name()}.json"

View File

@@ -0,0 +1,119 @@
#!/bin/bash
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
# Continuous monitoring script for sccache during builds
# Usage: ./monitor_sccache_during_build.sh [log_prefix] &
LOG_PREFIX=${1:-"sccache_monitor"}
# Include stage name in log filename if available
STAGE_SUFFIX=""
if [ -n "${STAGE_NAME}" ]; then
# Convert stage name to filename-safe format (replace spaces and special chars with underscores)
STAGE_SAFE=$(echo "${STAGE_NAME}" | sed 's/[^a-zA-Z0-9]/_/g' | sed 's/__*/_/g' | sed 's/^_\|_$//g')
STAGE_SUFFIX="_${STAGE_SAFE}"
fi
MONITOR_LOG="logs/${LOG_PREFIX}_$(date +%Y%m%d_%H%M%S)${STAGE_SUFFIX}.log"
MONITOR_INTERVAL=30 # seconds
echo "Starting sccache monitoring - logging to $MONITOR_LOG"
echo "Monitor interval: $MONITOR_INTERVAL seconds"
# Function to log with timestamp
log_with_timestamp() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$MONITOR_LOG"
}
# Function to get sccache stats safely
get_sccache_stats() {
if command -v sccache &> /dev/null; then
sccache --show-stats 2>/dev/null || echo "sccache stats unavailable"
else
echo "sccache command not found"
fi
}
# Function to check if sccache server is running
is_sccache_running() {
if command -v sccache &> /dev/null; then
sccache --show-stats &> /dev/null
return $?
else
return 1
fi
}
# Function to test Redis connectivity
test_redis_connectivity() {
# Use SCCACHE_REDIS if set, otherwise construct from CK_SCCACHE
local REDIS_URL=""
if [ -n "${SCCACHE_REDIS}" ]; then
REDIS_URL="${SCCACHE_REDIS}"
elif [ -n "${CK_SCCACHE}" ]; then
REDIS_URL="redis://${CK_SCCACHE}"
fi
if [ -n "${REDIS_URL}" ]; then
local start_time=$(date +%s%N)
local response=$(timeout 5 redis-cli -u "${REDIS_URL}" ping 2>&1) || response="TIMEOUT"
local end_time=$(date +%s%N)
local latency=$(( (end_time - start_time) / 1000000 ))
echo "Redis: $response (${latency}ms)"
else
echo "Redis: No Redis URL available"
fi
}
# Gets the last sccache stats before exiting
cleanup() {
log_with_timestamp "=== FINAL SCCACHE STATS EXIT ==="
log_with_timestamp "$(get_sccache_stats)"
echo "=== CONTINUOUS MONITORING STOPPED ==="
# List monitoring logs
echo "=== MONITORING LOGS ==="
ls -la logs/*monitor*.log 2>/dev/null || echo "No monitoring logs found"
}
trap cleanup EXIT
log_with_timestamp "=== SCCACHE MONITORING STARTED ==="
log_with_timestamp "PID: $$"
log_with_timestamp "Node: ${NODE_NAME:-$(hostname)}"
log_with_timestamp "Stage: ${STAGE_NAME:-unknown}"
log_with_timestamp "WORKSPACE_PATH: ${WORKSPACE:-not set}"
log_with_timestamp "SCCACHE_C_CUSTOM_CACHE_BUSTER: ${SCCACHE_C_CUSTOM_CACHE_BUSTER:-not set}"
log_with_timestamp "CK_SCCACHE: ${CK_SCCACHE:-not set}"
# Initial state
log_with_timestamp "=== INITIAL STATE ==="
# Reset sscache stats
sccache --zero-stats
log_with_timestamp "$(get_sccache_stats) $(test_redis_connectivity)"
# Monitor loop
while true; do
sleep $MONITOR_INTERVAL
# Check if sccache server is still running
if ! is_sccache_running; then
log_with_timestamp "WARNING: sccache server not running!"
fi
# Get current stats
current_stats=$(get_sccache_stats)
redis_status=$(test_redis_connectivity)
# Log current cache hit information
log_with_timestamp "$(get_sccache_stats) $(test_redis_connectivity)"
# Check for Redis latency issues
if echo "$redis_status" | grep -E "[0-9]{3,}" > /dev/null; then # >100ms latency
log_with_timestamp "HIGH REDIS LATENCY detected"
fi
# Check for Redis connection failures
if echo "$redis_status" | grep -E "(TIMEOUT|Connection refused|No route)" > /dev/null; then
log_with_timestamp "REDIS CONNECTION FAILURE detected"
fi
done