mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Adding sscache stats monitoring (#3428)
* Adding additional sccache and redis logging to each build
* Removing custom workspace
* Removing script reference
* Logging complete sccache stats
* Ensuring monitor is stopped if build fails
* Including additional sccache logging
* Removing build duration log
* Fixing groovy syntax error
* Fixing syntax
* Modifying logging statements
* Fixing syntax
* Modifying logging
* Modifying logging
* Including additional logging
* Fixing logging message
* Logging build path
* Testing
* Testing workspace path logs
* Adding additonal logging to monitor
* Modifying comments
* Adding copyright info
* Cleaning unnecessary logs
* Removing build time logs
* Merge branch 'develop' into aick-457
[ROCm/composable_kernel commit: e67cd7edeb]
This commit is contained in:
41
Jenkinsfile
vendored
41
Jenkinsfile
vendored
@@ -622,8 +622,45 @@ def cmake_build(Map conf=[:]){
|
||||
echo cmd
|
||||
|
||||
dir("build"){
|
||||
//build CK
|
||||
sh cmd
|
||||
// Start sccache monitoring
|
||||
if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
|
||||
sh """
|
||||
chmod +x ../script/monitor_sccache_during_build.sh
|
||||
mkdir -p logs
|
||||
export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
|
||||
../script/monitor_sccache_during_build.sh build_monitor &
|
||||
MONITOR_PID=\$!
|
||||
echo "Monitor PID: \$MONITOR_PID"
|
||||
echo \$MONITOR_PID > monitor.pid
|
||||
"""
|
||||
}
|
||||
try {
|
||||
//build CK
|
||||
sh cmd
|
||||
} catch (Exception buildError) {
|
||||
echo "Build failed: ${buildError.getMessage()}"
|
||||
throw buildError
|
||||
} finally {
|
||||
// Stop sccache monitoring
|
||||
if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
|
||||
sh """
|
||||
# Stop monitoring
|
||||
if [ -f monitor.pid ]; then
|
||||
MONITOR_PID=\$(cat monitor.pid)
|
||||
kill \$MONITOR_PID 2>/dev/null || echo "Monitor already stopped"
|
||||
rm -f monitor.pid
|
||||
fi
|
||||
"""
|
||||
|
||||
// Archive the monitoring logs
|
||||
try {
|
||||
archiveArtifacts artifacts: "logs/*monitor*.log", allowEmptyArchive: true
|
||||
} catch (Exception e) {
|
||||
echo "Could not archive sccache monitoring logs: ${e.getMessage()}"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//run tests except when NO_CK_BUILD or BUILD_LEGACY_OS are set
|
||||
if(!setup_args.contains("NO_CK_BUILD") && !params.BUILD_LEGACY_OS){
|
||||
sh "python3 ../script/ninja_json_converter.py .ninja_log --legacy-format --output ck_build_trace_${check_arch_name()}.json"
|
||||
|
||||
119
script/monitor_sccache_during_build.sh
Normal file
119
script/monitor_sccache_during_build.sh
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# Continuous monitoring script for sccache during builds
|
||||
# Usage: ./monitor_sccache_during_build.sh [log_prefix] &
|
||||
|
||||
LOG_PREFIX=${1:-"sccache_monitor"}
|
||||
|
||||
# Include stage name in log filename if available
|
||||
STAGE_SUFFIX=""
|
||||
if [ -n "${STAGE_NAME}" ]; then
|
||||
# Convert stage name to filename-safe format (replace spaces and special chars with underscores)
|
||||
STAGE_SAFE=$(echo "${STAGE_NAME}" | sed 's/[^a-zA-Z0-9]/_/g' | sed 's/__*/_/g' | sed 's/^_\|_$//g')
|
||||
STAGE_SUFFIX="_${STAGE_SAFE}"
|
||||
fi
|
||||
|
||||
MONITOR_LOG="logs/${LOG_PREFIX}_$(date +%Y%m%d_%H%M%S)${STAGE_SUFFIX}.log"
|
||||
MONITOR_INTERVAL=30 # seconds
|
||||
|
||||
echo "Starting sccache monitoring - logging to $MONITOR_LOG"
|
||||
echo "Monitor interval: $MONITOR_INTERVAL seconds"
|
||||
|
||||
# Function to log with timestamp
|
||||
log_with_timestamp() {
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$MONITOR_LOG"
|
||||
}
|
||||
|
||||
# Function to get sccache stats safely
|
||||
get_sccache_stats() {
|
||||
if command -v sccache &> /dev/null; then
|
||||
sccache --show-stats 2>/dev/null || echo "sccache stats unavailable"
|
||||
else
|
||||
echo "sccache command not found"
|
||||
fi
|
||||
}
|
||||
|
||||
# Function to check if sccache server is running
|
||||
is_sccache_running() {
|
||||
if command -v sccache &> /dev/null; then
|
||||
sccache --show-stats &> /dev/null
|
||||
return $?
|
||||
else
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Function to test Redis connectivity
|
||||
test_redis_connectivity() {
|
||||
# Use SCCACHE_REDIS if set, otherwise construct from CK_SCCACHE
|
||||
local REDIS_URL=""
|
||||
if [ -n "${SCCACHE_REDIS}" ]; then
|
||||
REDIS_URL="${SCCACHE_REDIS}"
|
||||
elif [ -n "${CK_SCCACHE}" ]; then
|
||||
REDIS_URL="redis://${CK_SCCACHE}"
|
||||
fi
|
||||
|
||||
if [ -n "${REDIS_URL}" ]; then
|
||||
local start_time=$(date +%s%N)
|
||||
local response=$(timeout 5 redis-cli -u "${REDIS_URL}" ping 2>&1) || response="TIMEOUT"
|
||||
local end_time=$(date +%s%N)
|
||||
local latency=$(( (end_time - start_time) / 1000000 ))
|
||||
echo "Redis: $response (${latency}ms)"
|
||||
else
|
||||
echo "Redis: No Redis URL available"
|
||||
fi
|
||||
}
|
||||
|
||||
# Gets the last sccache stats before exiting
|
||||
cleanup() {
|
||||
log_with_timestamp "=== FINAL SCCACHE STATS EXIT ==="
|
||||
log_with_timestamp "$(get_sccache_stats)"
|
||||
echo "=== CONTINUOUS MONITORING STOPPED ==="
|
||||
# List monitoring logs
|
||||
echo "=== MONITORING LOGS ==="
|
||||
ls -la logs/*monitor*.log 2>/dev/null || echo "No monitoring logs found"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
log_with_timestamp "=== SCCACHE MONITORING STARTED ==="
|
||||
log_with_timestamp "PID: $$"
|
||||
log_with_timestamp "Node: ${NODE_NAME:-$(hostname)}"
|
||||
log_with_timestamp "Stage: ${STAGE_NAME:-unknown}"
|
||||
log_with_timestamp "WORKSPACE_PATH: ${WORKSPACE:-not set}"
|
||||
log_with_timestamp "SCCACHE_C_CUSTOM_CACHE_BUSTER: ${SCCACHE_C_CUSTOM_CACHE_BUSTER:-not set}"
|
||||
log_with_timestamp "CK_SCCACHE: ${CK_SCCACHE:-not set}"
|
||||
|
||||
# Initial state
|
||||
log_with_timestamp "=== INITIAL STATE ==="
|
||||
# Reset sscache stats
|
||||
sccache --zero-stats
|
||||
log_with_timestamp "$(get_sccache_stats) $(test_redis_connectivity)"
|
||||
|
||||
# Monitor loop
|
||||
while true; do
|
||||
sleep $MONITOR_INTERVAL
|
||||
|
||||
# Check if sccache server is still running
|
||||
if ! is_sccache_running; then
|
||||
log_with_timestamp "WARNING: sccache server not running!"
|
||||
fi
|
||||
|
||||
# Get current stats
|
||||
current_stats=$(get_sccache_stats)
|
||||
redis_status=$(test_redis_connectivity)
|
||||
|
||||
# Log current cache hit information
|
||||
log_with_timestamp "$(get_sccache_stats) $(test_redis_connectivity)"
|
||||
|
||||
# Check for Redis latency issues
|
||||
if echo "$redis_status" | grep -E "[0-9]{3,}" > /dev/null; then # >100ms latency
|
||||
log_with_timestamp "HIGH REDIS LATENCY detected"
|
||||
fi
|
||||
|
||||
# Check for Redis connection failures
|
||||
if echo "$redis_status" | grep -E "(TIMEOUT|Connection refused|No route)" > /dev/null; then
|
||||
log_with_timestamp "REDIS CONNECTION FAILURE detected"
|
||||
fi
|
||||
done
|
||||
Reference in New Issue
Block a user