mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
Merge commit '92653168c2b276d4467320f5bdff5ec6cbddf4e6' into develop
This commit is contained in:
25
CHANGELOG.md
25
CHANGELOG.md
@@ -17,21 +17,22 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
|
||||
## Composable Kernel 1.2.0 for ROCm 7.2.0
|
||||
|
||||
### Added
|
||||
* Added support for fp8 dynamic tensor-wise quantization of fp8 fmha fwd kernel.
|
||||
* Added support for bf16 data type to grouped_gemm and grouped_gemm_preshuffle.
|
||||
* Added Col-Col-Row-Col layout support for aquant mode in blockscale GEMM.
|
||||
* Added support for mixed precision fp8 x bf8 universal GEMM and weight preshuffle GEMM
|
||||
* Added a compute async pipeline in the CK TILE universal GEMM on gfx950
|
||||
* Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
|
||||
* Added support for mixed precision fp8 x bf8 universal GEMM and weight preshuffle GEMM.
|
||||
* Added a compute async pipeline in the CK Tile universal GEMM on gfx950.
|
||||
* Added support for B Tensor type `pk_int4_t` in the CK Tile weight preshuffle GEMM.
|
||||
* Added the new api to load different memory sizes to SGPR.
|
||||
* Added support for B Tensor Preshuffle in CK TILE Grouped GEMM.
|
||||
* Added support for B Tensor preshuffle in CK Tile grouped GEMM.
|
||||
* Added a basic copy kernel example and supporting documentation for new CK Tile developers.
|
||||
* Added support for grouped_gemm kernels to perform multi_d elementwise operation.
|
||||
* Added support for Multiple ABD GEMM
|
||||
* Added support for grouped GEMM kernels to perform Multi D elementwise operation.
|
||||
* Added support for multiple ABD GEMM.
|
||||
* Added benchmarking support for tile engine GEMM Multi D.
|
||||
* Added block scaling support in CK_TILE GEMM, allowing flexible use of quantization matrices from either A or B operands.
|
||||
* Added the row-wise column-wise quantization for CK_TILE GEMM & CK_TILE Grouped GEMM.
|
||||
* Added support for f32 to FMHA (fwd/bwd).
|
||||
* Added tensor-wise quantization for CK_TILE GEMM.
|
||||
* Added block scaling support in CK Tile GEMM, allowing flexible use of quantization matrices from either A or B operands.
|
||||
* Added the row-wise column-wise quantization for CK Tile GEMM and CK Tile grouped GEMM.
|
||||
* Added support for f32 to FMHA (forward and backward).
|
||||
* Added tensor-wise quantization for CK Tile GEMM.
|
||||
* Added support for batched contraction kernel.
|
||||
* Added WMMA (gfx12) support for FMHA.
|
||||
* Added pooling kernel in CK_TILE
|
||||
@@ -41,7 +42,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
|
||||
|
||||
### Changed
|
||||
|
||||
* Removed `BlockSize` in `make_kernel` and `CShuffleEpilogueProblem` to support Wave32 in CK_TILE (#2594)
|
||||
* Removed `BlockSize` in `make_kernel` and `CShuffleEpilogueProblem` to support Wave32 in CK Tile (#2594)
|
||||
* Added an optional template parameter `Arch` (`gfx9_t`, `gfx12_t` etc.) to `make_kernel` to support linking multiple object files that have the same kernel compiled for different architectures.
|
||||
* FMHA examples and tests can be built for multiple architectures (gfx9, gfx950, gfx12) at the same time.
|
||||
|
||||
@@ -91,7 +92,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
|
||||
### Optimized
|
||||
|
||||
* Optimize the gemm multiply multiply preshuffle & lds bypass with Pack of KGroup and better instruction layout.
|
||||
* Added Vectorize Transpose optimization for CK Tile
|
||||
* Added Vectorize Transpose optimization for CK Tile
|
||||
* Added the asynchronous copy for gfx950
|
||||
|
||||
### Changed
|
||||
|
||||
41
Jenkinsfile
vendored
41
Jenkinsfile
vendored
@@ -622,8 +622,45 @@ def cmake_build(Map conf=[:]){
|
||||
echo cmd
|
||||
|
||||
dir("build"){
|
||||
//build CK
|
||||
sh cmd
|
||||
// Start sccache monitoring
|
||||
if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
|
||||
sh """
|
||||
chmod +x ../script/monitor_sccache_during_build.sh
|
||||
mkdir -p logs
|
||||
export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
|
||||
../script/monitor_sccache_during_build.sh build_monitor &
|
||||
MONITOR_PID=\$!
|
||||
echo "Monitor PID: \$MONITOR_PID"
|
||||
echo \$MONITOR_PID > monitor.pid
|
||||
"""
|
||||
}
|
||||
try {
|
||||
//build CK
|
||||
sh cmd
|
||||
} catch (Exception buildError) {
|
||||
echo "Build failed: ${buildError.getMessage()}"
|
||||
throw buildError
|
||||
} finally {
|
||||
// Stop sccache monitoring
|
||||
if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
|
||||
sh """
|
||||
# Stop monitoring
|
||||
if [ -f monitor.pid ]; then
|
||||
MONITOR_PID=\$(cat monitor.pid)
|
||||
kill \$MONITOR_PID 2>/dev/null || echo "Monitor already stopped"
|
||||
rm -f monitor.pid
|
||||
fi
|
||||
"""
|
||||
|
||||
// Archive the monitoring logs
|
||||
try {
|
||||
archiveArtifacts artifacts: "logs/*monitor*.log", allowEmptyArchive: true
|
||||
} catch (Exception e) {
|
||||
echo "Could not archive sccache monitoring logs: ${e.getMessage()}"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//run tests except when NO_CK_BUILD or BUILD_LEGACY_OS are set
|
||||
if(!setup_args.contains("NO_CK_BUILD") && !params.BUILD_LEGACY_OS){
|
||||
sh "python3 ../script/ninja_json_converter.py .ninja_log --legacy-format --output ck_build_trace_${check_arch_name()}.json"
|
||||
|
||||
@@ -15,6 +15,7 @@ configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
|
||||
find_package(ROCM)
|
||||
include(ROCMInstallTargets)
|
||||
include(ROCMTest)
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm $ENV{ROCM_PATH})
|
||||
find_package(hiprtc REQUIRED)
|
||||
|
||||
rocm_setup_version(VERSION 1.0)
|
||||
|
||||
@@ -47,7 +47,7 @@ set(FMHA_FWD_CODE_GEN_COMMON_ARGS
|
||||
${CMAKE_CURRENT_LIST_DIR}/generate.py
|
||||
--targets ${FMHA_TARGETS_ARG}
|
||||
--api ${FMHA_FWD_APIS}
|
||||
--optdim 32,64,128,256
|
||||
--optdim 32,64,80,128,256
|
||||
# --filter fmha_fwd...
|
||||
)
|
||||
set(FMHA_BWD_CODE_GEN_COMMON_ARGS
|
||||
|
||||
@@ -40,7 +40,16 @@ DTYPE_BITS = {
|
||||
"bf8": 8,
|
||||
}
|
||||
|
||||
K0_MAX_SUBMAX_MAP = {32: 32, 48: 48, 64: 64, 96: 128, 128: 128, 192: 192, 256: 256}
|
||||
K0_MAX_SUBMAX_MAP = {
|
||||
32: 32,
|
||||
48: 48,
|
||||
64: 64,
|
||||
80: 96,
|
||||
96: 128,
|
||||
128: 128,
|
||||
192: 192,
|
||||
256: 256,
|
||||
}
|
||||
|
||||
FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.\n
|
||||
@@ -930,6 +939,7 @@ class KernelComponentFactoryGfx9(CompatibilityRuleFactoryGfx9):
|
||||
( 64, 64) : [FmhaFwdTileSize( 16, 32, 64, 64, 32, 64, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 32, -1),
|
||||
FmhaFwdTileSize( 32, 32, 64, 64, 32, 64, 1, 1, 1, 1, 1, 1, 32, 32, 16, 32, 32, 16, -1),
|
||||
FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)],
|
||||
( 80, 96) : [FmhaFwdTileSize(128, 128, 16, 96, 32, 80, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)],
|
||||
( 96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)],
|
||||
(128, 128) : [FmhaFwdTileSize( 16, 32, 64, 128, 32, 128, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 32, -1),
|
||||
FmhaFwdTileSize( 32, 32, 128, 128, 32, 128, 1, 1, 1, 1, 1, 1, 32, 32, 16, 32, 32, 16, -1),
|
||||
@@ -1014,8 +1024,12 @@ class KernelComponentFactoryGfx9(CompatibilityRuleFactoryGfx9):
|
||||
["no"],
|
||||
["f", "t"],
|
||||
):
|
||||
pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "f", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink)) # fmt: skip
|
||||
pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink)) # fmt: skip
|
||||
if hdim == 64:
|
||||
pipelines.append(FmhaFwdPipeline("qr", "row", "t", "f", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink)) # fmt: skip
|
||||
pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink)) # fmt: skip
|
||||
else:
|
||||
pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "f", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink)) # fmt: skip
|
||||
pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink)) # fmt: skip
|
||||
elif dtype in ["fp8", "fp8fp16", "bf8"]:
|
||||
# TODO
|
||||
pass
|
||||
|
||||
@@ -1121,6 +1121,20 @@ llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
|
||||
index_t soffset,
|
||||
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
|
||||
|
||||
// dwordx3 - use union to convert between int32x3 and fp16/bf16 types
|
||||
union dwordx3_union
|
||||
{
|
||||
int32_t as_i32[3];
|
||||
fp16_t as_fp16[6];
|
||||
bf16_t as_bf16[6];
|
||||
};
|
||||
|
||||
CK_TILE_DEVICE_EXTERN int32x3_t
|
||||
llvm_amdgcn_raw_buffer_load_i32x3(int32x4_t srsrc,
|
||||
index_t voffset,
|
||||
index_t soffset,
|
||||
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v3i32");
|
||||
|
||||
CK_TILE_DEVICE_EXTERN int32x4_t
|
||||
llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
|
||||
index_t voffset,
|
||||
@@ -1540,9 +1554,9 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
|
||||
(std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
|
||||
(std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
|
||||
(std::is_same<T, fp16_t>::value &&
|
||||
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
|
||||
(N == 1 || N == 2 || N == 4 || N == 6 || N == 8 || N == 16 || N == 32)) ||
|
||||
(std::is_same<T, bf16_t>::value &&
|
||||
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
|
||||
(N == 1 || N == 2 || N == 4 || N == 6 || N == 8 || N == 16 || N == 32)) ||
|
||||
(std::is_same<T, int32_t>::value &&
|
||||
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
|
||||
(std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
|
||||
@@ -1659,6 +1673,26 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
|
||||
src_wave_addr_offset,
|
||||
static_cast<index_t>(coherence)));
|
||||
}
|
||||
else if constexpr(N == 6)
|
||||
{
|
||||
// N = 6: load as dwordx3 (12 bytes = 6 fp16), using buffer_load_dwordx3 instruction
|
||||
int32x3_t tmp_i32x3 =
|
||||
llvm_amdgcn_raw_buffer_load_i32x3(src_wave_buffer_resource,
|
||||
src_thread_addr_offset,
|
||||
src_wave_addr_offset,
|
||||
static_cast<index_t>(coherence));
|
||||
|
||||
// Use union to reinterpret int32x3 as fp16x6
|
||||
dwordx3_union tmp_union;
|
||||
tmp_union.as_i32[0] = tmp_i32x3[0];
|
||||
tmp_union.as_i32[1] = tmp_i32x3[1];
|
||||
tmp_union.as_i32[2] = tmp_i32x3[2];
|
||||
|
||||
thread_buffer<fp16_t, N> result;
|
||||
static_for<0, N, 1>{}([&](auto i) { result[i] = tmp_union.as_fp16[i]; });
|
||||
|
||||
return result;
|
||||
}
|
||||
else if constexpr(N == 8)
|
||||
{
|
||||
// use fp32 load to mimic fp16 load
|
||||
@@ -1744,6 +1778,26 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
|
||||
src_wave_addr_offset,
|
||||
static_cast<index_t>(coherence)));
|
||||
}
|
||||
else if constexpr(N == 6)
|
||||
{
|
||||
// N = 6: load as dwordx3 (12 bytes = 6 bf16), using buffer_load_dwordx3 instruction
|
||||
int32x3_t tmp_i32x3 =
|
||||
llvm_amdgcn_raw_buffer_load_i32x3(src_wave_buffer_resource,
|
||||
src_thread_addr_offset,
|
||||
src_wave_addr_offset,
|
||||
static_cast<index_t>(coherence));
|
||||
|
||||
// Use union to reinterpret int32x3 as bf16x6
|
||||
dwordx3_union tmp_union;
|
||||
tmp_union.as_i32[0] = tmp_i32x3[0];
|
||||
tmp_union.as_i32[1] = tmp_i32x3[1];
|
||||
tmp_union.as_i32[2] = tmp_i32x3[2];
|
||||
|
||||
thread_buffer<bf16_t, N> result;
|
||||
static_for<0, N, 1>{}([&](auto i) { result[i] = tmp_union.as_bf16[i]; });
|
||||
|
||||
return result;
|
||||
}
|
||||
else if constexpr(N == 8)
|
||||
{
|
||||
int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
|
||||
|
||||
@@ -989,6 +989,20 @@ llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
|
||||
index_t soffset,
|
||||
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
|
||||
|
||||
// dwordx3 - use union to convert between int32x3 and fp16/bf16 types
|
||||
union dwordx3_union
|
||||
{
|
||||
int32_t as_i32[3];
|
||||
fp16_t as_fp16[6];
|
||||
bf16_t as_bf16[6];
|
||||
};
|
||||
|
||||
CK_TILE_DEVICE_EXTERN int32x3_t
|
||||
llvm_amdgcn_raw_buffer_load_i32x3(int32x4_t srsrc,
|
||||
index_t voffset,
|
||||
index_t soffset,
|
||||
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v3i32");
|
||||
|
||||
CK_TILE_DEVICE_EXTERN int32x4_t
|
||||
llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
|
||||
index_t voffset,
|
||||
@@ -1408,9 +1422,9 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
|
||||
(std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
|
||||
(std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
|
||||
(std::is_same<T, fp16_t>::value &&
|
||||
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
|
||||
(N == 1 || N == 2 || N == 4 || N == 6 || N == 8 || N == 16 || N == 32)) ||
|
||||
(std::is_same<T, bf16_t>::value &&
|
||||
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
|
||||
(N == 1 || N == 2 || N == 4 || N == 6 || N == 8 || N == 16 || N == 32)) ||
|
||||
(std::is_same<T, int32_t>::value &&
|
||||
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
|
||||
(std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
|
||||
@@ -1529,6 +1543,26 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
|
||||
src_wave_addr_offset,
|
||||
static_cast<index_t>(coherence)));
|
||||
}
|
||||
else if constexpr(N == 6)
|
||||
{
|
||||
// N = 6: load as dwordx3 (12 bytes = 6 fp16), using buffer_load_dwordx3 instruction
|
||||
int32x3_t tmp_i32x3 =
|
||||
llvm_amdgcn_raw_buffer_load_i32x3(src_wave_buffer_resource,
|
||||
src_thread_addr_offset,
|
||||
src_wave_addr_offset,
|
||||
static_cast<index_t>(coherence));
|
||||
|
||||
// Use union to reinterpret int32x3 as fp16x6
|
||||
dwordx3_union tmp_union;
|
||||
tmp_union.as_i32[0] = tmp_i32x3[0];
|
||||
tmp_union.as_i32[1] = tmp_i32x3[1];
|
||||
tmp_union.as_i32[2] = tmp_i32x3[2];
|
||||
|
||||
thread_buffer<fp16_t, N> result;
|
||||
static_for<0, N, 1>{}([&](auto i) { result[i] = tmp_union.as_fp16[i]; });
|
||||
|
||||
return result;
|
||||
}
|
||||
else
|
||||
{
|
||||
// N >= 8: build from fp32x4 chunks
|
||||
@@ -1571,6 +1605,26 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
|
||||
src_wave_addr_offset,
|
||||
static_cast<index_t>(coherence)));
|
||||
}
|
||||
else if constexpr(N == 6)
|
||||
{
|
||||
// N = 6: load as dwordx3 (12 bytes = 6 bf16), using buffer_load_dwordx3 instruction
|
||||
int32x3_t tmp_i32x3 =
|
||||
llvm_amdgcn_raw_buffer_load_i32x3(src_wave_buffer_resource,
|
||||
src_thread_addr_offset,
|
||||
src_wave_addr_offset,
|
||||
static_cast<index_t>(coherence));
|
||||
|
||||
// Use union to reinterpret int32x3 as bf16x6
|
||||
dwordx3_union tmp_union;
|
||||
tmp_union.as_i32[0] = tmp_i32x3[0];
|
||||
tmp_union.as_i32[1] = tmp_i32x3[1];
|
||||
tmp_union.as_i32[2] = tmp_i32x3[2];
|
||||
|
||||
thread_buffer<bf16_t, N> result;
|
||||
static_for<0, N, 1>{}([&](auto i) { result[i] = tmp_union.as_bf16[i]; });
|
||||
|
||||
return result;
|
||||
}
|
||||
else
|
||||
{
|
||||
// N >= 8: build from fp32x4 chunks
|
||||
|
||||
@@ -152,6 +152,7 @@ using bf16x64_t = bfloat16_t __attribute__((ext_vector_type(64)));
|
||||
// i32
|
||||
// using int32_t = ...
|
||||
using int32x2_t = int32_t __attribute__((ext_vector_type(2)));
|
||||
using int32x3_t = int32_t __attribute__((ext_vector_type(3)));
|
||||
using int32x4_t = int32_t __attribute__((ext_vector_type(4)));
|
||||
using int32x8_t = int32_t __attribute__((ext_vector_type(8)));
|
||||
using int32x16_t = int32_t __attribute__((ext_vector_type(16)));
|
||||
|
||||
@@ -12,6 +12,8 @@ static CK_TILE_HOST_DEVICE constexpr index_t ceil_to_qualified_tile_length()
|
||||
{
|
||||
if constexpr(Headdim == 48)
|
||||
return 48;
|
||||
else if constexpr(Headdim == 80)
|
||||
return 96;
|
||||
else if constexpr(Headdim == 96)
|
||||
return 128;
|
||||
else if constexpr(Headdim == 160)
|
||||
|
||||
@@ -25,7 +25,7 @@ GPU_TARGETS="gfx908;gfx90a;gfx942"
|
||||
if [ $# -ge 1 ]; then
|
||||
case "$1" in
|
||||
gfx*)
|
||||
GPU_TARGETS=$1
|
||||
GPU_TARGETS="$1"
|
||||
shift 1
|
||||
echo "GPU targets provided: $GPU_TARGETS"
|
||||
REST_ARGS=("$@")
|
||||
@@ -44,8 +44,8 @@ cmake
|
||||
-D CMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker -fbracket-depth=512" \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D BUILD_DEV=ON \
|
||||
-D GPU_TARGETS=$GPU_TARGETS \
|
||||
-D GPU_TARGETS="$GPU_TARGETS" \
|
||||
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
|
||||
-D USE_BITINT_EXTENSION_INT4=OFF \
|
||||
"${REST_ARGS[@]}" \ \
|
||||
${MY_PROJECT_SOURCE}
|
||||
"${REST_ARGS[@]}" \
|
||||
"${MY_PROJECT_SOURCE}"
|
||||
|
||||
119
script/monitor_sccache_during_build.sh
Normal file
119
script/monitor_sccache_during_build.sh
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# Continuous monitoring script for sccache during builds
|
||||
# Usage: ./monitor_sccache_during_build.sh [log_prefix] &
|
||||
|
||||
LOG_PREFIX=${1:-"sccache_monitor"}
|
||||
|
||||
# Include stage name in log filename if available
|
||||
STAGE_SUFFIX=""
|
||||
if [ -n "${STAGE_NAME}" ]; then
|
||||
# Convert stage name to filename-safe format (replace spaces and special chars with underscores)
|
||||
STAGE_SAFE=$(echo "${STAGE_NAME}" | sed 's/[^a-zA-Z0-9]/_/g' | sed 's/__*/_/g' | sed 's/^_\|_$//g')
|
||||
STAGE_SUFFIX="_${STAGE_SAFE}"
|
||||
fi
|
||||
|
||||
MONITOR_LOG="logs/${LOG_PREFIX}_$(date +%Y%m%d_%H%M%S)${STAGE_SUFFIX}.log"
|
||||
MONITOR_INTERVAL=30 # seconds
|
||||
|
||||
echo "Starting sccache monitoring - logging to $MONITOR_LOG"
|
||||
echo "Monitor interval: $MONITOR_INTERVAL seconds"
|
||||
|
||||
# Function to log with timestamp
|
||||
log_with_timestamp() {
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$MONITOR_LOG"
|
||||
}
|
||||
|
||||
# Function to get sccache stats safely
|
||||
get_sccache_stats() {
|
||||
if command -v sccache &> /dev/null; then
|
||||
sccache --show-stats 2>/dev/null || echo "sccache stats unavailable"
|
||||
else
|
||||
echo "sccache command not found"
|
||||
fi
|
||||
}
|
||||
|
||||
# Function to check if sccache server is running
|
||||
is_sccache_running() {
|
||||
if command -v sccache &> /dev/null; then
|
||||
sccache --show-stats &> /dev/null
|
||||
return $?
|
||||
else
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Function to test Redis connectivity
|
||||
test_redis_connectivity() {
|
||||
# Use SCCACHE_REDIS if set, otherwise construct from CK_SCCACHE
|
||||
local REDIS_URL=""
|
||||
if [ -n "${SCCACHE_REDIS}" ]; then
|
||||
REDIS_URL="${SCCACHE_REDIS}"
|
||||
elif [ -n "${CK_SCCACHE}" ]; then
|
||||
REDIS_URL="redis://${CK_SCCACHE}"
|
||||
fi
|
||||
|
||||
if [ -n "${REDIS_URL}" ]; then
|
||||
local start_time=$(date +%s%N)
|
||||
local response=$(timeout 5 redis-cli -u "${REDIS_URL}" ping 2>&1) || response="TIMEOUT"
|
||||
local end_time=$(date +%s%N)
|
||||
local latency=$(( (end_time - start_time) / 1000000 ))
|
||||
echo "Redis: $response (${latency}ms)"
|
||||
else
|
||||
echo "Redis: No Redis URL available"
|
||||
fi
|
||||
}
|
||||
|
||||
# Gets the last sccache stats before exiting
|
||||
cleanup() {
|
||||
log_with_timestamp "=== FINAL SCCACHE STATS EXIT ==="
|
||||
log_with_timestamp "$(get_sccache_stats)"
|
||||
echo "=== CONTINUOUS MONITORING STOPPED ==="
|
||||
# List monitoring logs
|
||||
echo "=== MONITORING LOGS ==="
|
||||
ls -la logs/*monitor*.log 2>/dev/null || echo "No monitoring logs found"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
log_with_timestamp "=== SCCACHE MONITORING STARTED ==="
|
||||
log_with_timestamp "PID: $$"
|
||||
log_with_timestamp "Node: ${NODE_NAME:-$(hostname)}"
|
||||
log_with_timestamp "Stage: ${STAGE_NAME:-unknown}"
|
||||
log_with_timestamp "WORKSPACE_PATH: ${WORKSPACE:-not set}"
|
||||
log_with_timestamp "SCCACHE_C_CUSTOM_CACHE_BUSTER: ${SCCACHE_C_CUSTOM_CACHE_BUSTER:-not set}"
|
||||
log_with_timestamp "CK_SCCACHE: ${CK_SCCACHE:-not set}"
|
||||
|
||||
# Initial state
|
||||
log_with_timestamp "=== INITIAL STATE ==="
|
||||
# Reset sscache stats
|
||||
sccache --zero-stats
|
||||
log_with_timestamp "$(get_sccache_stats) $(test_redis_connectivity)"
|
||||
|
||||
# Monitor loop
|
||||
while true; do
|
||||
sleep $MONITOR_INTERVAL
|
||||
|
||||
# Check if sccache server is still running
|
||||
if ! is_sccache_running; then
|
||||
log_with_timestamp "WARNING: sccache server not running!"
|
||||
fi
|
||||
|
||||
# Get current stats
|
||||
current_stats=$(get_sccache_stats)
|
||||
redis_status=$(test_redis_connectivity)
|
||||
|
||||
# Log current cache hit information
|
||||
log_with_timestamp "$(get_sccache_stats) $(test_redis_connectivity)"
|
||||
|
||||
# Check for Redis latency issues
|
||||
if echo "$redis_status" | grep -E "[0-9]{3,}" > /dev/null; then # >100ms latency
|
||||
log_with_timestamp "HIGH REDIS LATENCY detected"
|
||||
fi
|
||||
|
||||
# Check for Redis connection failures
|
||||
if echo "$redis_status" | grep -E "(TIMEOUT|Connection refused|No route)" > /dev/null; then
|
||||
log_with_timestamp "REDIS CONNECTION FAILURE detected"
|
||||
fi
|
||||
done
|
||||
Reference in New Issue
Block a user