From c5faecd894c2d3122ae2bc29fe98da54874426a7 Mon Sep 17 00:00:00 2001
From: "assistant-librarian[bot]"
 <assistant-librarian[bot]@users.noreply.github.com>
Date: Wed, 17 Dec 2025 17:16:17 +0000
Subject: [PATCH] Merge commit '92653168c2b276d4467320f5bdff5ec6cbddf4e6' into
 develop

---
 CHANGELOG.md                                  |  25 ++--
 Jenkinsfile                                   |  41 +++++-
 codegen/CMakeLists.txt                        |   1 +
 example/ck_tile/01_fmha/CMakeLists.txt        |   2 +-
 .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py   |  20 ++-
 .../core/arch/amd_buffer_addressing.hpp       |  58 ++++++++-
 .../arch/amd_buffer_addressing_builtins.hpp   |  58 ++++++++-
 include/ck_tile/core/numeric/vector_type.hpp  |   1 +
 .../ops/fmha/pipeline/tile_fmha_shape.hpp     |   2 +
 script/cmake-ck-dev.sh                        |   8 +-
 script/monitor_sccache_during_build.sh        | 119 ++++++++++++++++++
 11 files changed, 309 insertions(+), 26 deletions(-)
 create mode 100644 script/monitor_sccache_during_build.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a69ce2260e..d9fad8c6d6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,21 +17,22 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 ## Composable Kernel 1.2.0 for ROCm 7.2.0
 
 ### Added
+* Added support for fp8 dynamic tensor-wise quantization of fp8 fmha fwd kernel.
 * Added support for bf16 data type to grouped_gemm and grouped_gemm_preshuffle.
 * Added Col-Col-Row-Col layout support for aquant mode in blockscale GEMM.
-* Added support for mixed precision fp8 x bf8 universal GEMM and weight preshuffle GEMM
-* Added a compute async pipeline in the CK TILE universal GEMM on gfx950
-* Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
+* Added support for mixed precision fp8 x bf8 universal GEMM and weight preshuffle GEMM.
+* Added a compute async pipeline in the CK Tile universal GEMM on gfx950.
+* Added support for B Tensor type `pk_int4_t` in the CK Tile weight preshuffle GEMM.
 * Added the new api to load different memory sizes to SGPR.
-* Added support for B Tensor Preshuffle in CK TILE Grouped GEMM.
+* Added support for B Tensor preshuffle in CK Tile grouped GEMM.
 * Added a basic copy kernel example and supporting documentation for new CK Tile developers.
-* Added support for grouped_gemm kernels to perform multi_d elementwise operation.
-* Added support for Multiple ABD GEMM
+* Added support for grouped GEMM kernels to perform Multi D elementwise operation.
+* Added support for multiple ABD GEMM.
 * Added benchmarking support for tile engine GEMM Multi D.
-* Added block scaling support in CK_TILE GEMM, allowing flexible use of quantization matrices from either A or B operands.
-* Added the row-wise column-wise quantization for CK_TILE GEMM & CK_TILE Grouped GEMM.
-* Added support for f32 to FMHA (fwd/bwd).
-* Added tensor-wise quantization for CK_TILE GEMM.
+* Added block scaling support in CK Tile GEMM, allowing flexible use of quantization matrices from either A or B operands.
+* Added the row-wise column-wise quantization for CK Tile GEMM and CK Tile grouped GEMM.
+* Added support for f32 to FMHA (forward and backward).
+* Added tensor-wise quantization for CK Tile GEMM.
 * Added support for batched contraction kernel.
 * Added WMMA (gfx12) support for FMHA.
 * Added pooling kernel in CK_TILE
@@ -41,7 +42,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 
 ### Changed
 
-* Removed `BlockSize` in `make_kernel` and `CShuffleEpilogueProblem` to support Wave32 in CK_TILE (#2594)
+* Removed `BlockSize` in `make_kernel` and `CShuffleEpilogueProblem` to support Wave32 in CK Tile (#2594)
 * Added an optional template parameter `Arch` (`gfx9_t`, `gfx12_t` etc.) to `make_kernel` to support linking multiple object files that have the same kernel compiled for different architectures.
 * FMHA examples and tests can be built for multiple architectures (gfx9, gfx950, gfx12) at the same time.
 
@@ -91,7 +92,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 ### Optimized
 
 * Optimize the gemm multiply multiply preshuffle & lds bypass with Pack of KGroup and better instruction layout.
-* Added Vectorize Transpose optimization for CK Tile 
+* Added Vectorize Transpose optimization for CK Tile
 * Added the asynchronous copy for gfx950
 
 ### Changed
diff --git a/Jenkinsfile b/Jenkinsfile
index 2a1d1fd904..cf4f13eff1 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -622,8 +622,45 @@ def cmake_build(Map conf=[:]){
     echo cmd
 
     dir("build"){
-        //build CK
-        sh cmd
+        // Start sccache monitoring
+        if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
+            sh """
+                chmod +x ../script/monitor_sccache_during_build.sh
+                mkdir -p logs
+                export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
+                ../script/monitor_sccache_during_build.sh build_monitor &
+                MONITOR_PID=\$!
+                echo "Monitor PID: \$MONITOR_PID"
+                echo \$MONITOR_PID > monitor.pid
+            """
+        }
+        try {
+            //build CK
+            sh cmd
+        } catch (Exception buildError) {
+            echo "Build failed: ${buildError.getMessage()}"
+            throw buildError
+        } finally {
+            // Stop sccache monitoring
+            if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
+                sh """
+                    # Stop monitoring
+                    if [ -f monitor.pid ]; then
+                        MONITOR_PID=\$(cat monitor.pid)
+                        kill \$MONITOR_PID 2>/dev/null || echo "Monitor already stopped"
+                        rm -f monitor.pid
+                    fi
+                """
+                
+                // Archive the monitoring logs
+                try {
+                    archiveArtifacts artifacts: "logs/*monitor*.log", allowEmptyArchive: true
+                } catch (Exception e) {
+                    echo "Could not archive sccache monitoring logs: ${e.getMessage()}"
+                }
+            }
+        }
+
         //run tests except when NO_CK_BUILD or BUILD_LEGACY_OS are set
         if(!setup_args.contains("NO_CK_BUILD") && !params.BUILD_LEGACY_OS){
             sh "python3 ../script/ninja_json_converter.py .ninja_log --legacy-format --output ck_build_trace_${check_arch_name()}.json"
diff --git a/codegen/CMakeLists.txt b/codegen/CMakeLists.txt
index 22d8e58d10..69a6a71de2 100644
--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -15,6 +15,7 @@ configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
 find_package(ROCM)
 include(ROCMInstallTargets)
 include(ROCMTest)
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm $ENV{ROCM_PATH})
 find_package(hiprtc REQUIRED)
 
 rocm_setup_version(VERSION 1.0)
diff --git a/example/ck_tile/01_fmha/CMakeLists.txt b/example/ck_tile/01_fmha/CMakeLists.txt
index 9c81207361..fbd6551091 100644
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -47,7 +47,7 @@ set(FMHA_FWD_CODE_GEN_COMMON_ARGS
   ${CMAKE_CURRENT_LIST_DIR}/generate.py
   --targets ${FMHA_TARGETS_ARG}
   --api ${FMHA_FWD_APIS}
-  --optdim 32,64,128,256
+  --optdim 32,64,80,128,256
   # --filter fmha_fwd...
 )
 set(FMHA_BWD_CODE_GEN_COMMON_ARGS
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index 4d6900a802..0cffb2642c 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -40,7 +40,16 @@ DTYPE_BITS = {
     "bf8": 8,
 }
 
-K0_MAX_SUBMAX_MAP = {32: 32, 48: 48, 64: 64, 96: 128, 128: 128, 192: 192, 256: 256}
+K0_MAX_SUBMAX_MAP = {
+    32: 32,
+    48: 48,
+    64: 64,
+    80: 96,
+    96: 128,
+    128: 128,
+    192: 192,
+    256: 256,
+}
 
 FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.\n
@@ -930,6 +939,7 @@ class KernelComponentFactoryGfx9(CompatibilityRuleFactoryGfx9):
                 ( 64,  64) : [FmhaFwdTileSize( 16,  32,  64,  64,  32,  64,  1, 1, 1,  1, 1, 1,  16, 16, 32,  16, 16, 32,  -1),
                               FmhaFwdTileSize( 32,  32,  64,  64,  32,  64,  1, 1, 1,  1, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
                               FmhaFwdTileSize(128,  64,  32,  64,  32,  64,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                ( 80, 96)  : [FmhaFwdTileSize(128, 128,  16,  96,  32,  80,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
                 ( 96, 128) : [FmhaFwdTileSize(128, 128,  32, 128,  32,  96,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
                 (128, 128) : [FmhaFwdTileSize( 16,  32,  64, 128,  32, 128,  1, 1, 1,  1, 1, 1,  16, 16, 32,  16, 16, 32,  -1),
                               FmhaFwdTileSize( 32,  32, 128, 128,  32, 128,  1, 1, 1,  1, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
@@ -1014,8 +1024,12 @@ class KernelComponentFactoryGfx9(CompatibilityRuleFactoryGfx9):
                 ["no"],
                 ["f", "t"],
             ):
-                pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "f", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink))  # fmt: skip
-                pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink))  # fmt: skip
+                if hdim == 64:
+                    pipelines.append(FmhaFwdPipeline("qr", "row", "t", "f", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink))  # fmt: skip
+                    pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink))  # fmt: skip
+                else:
+                    pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "f", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink))  # fmt: skip
+                    pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink))  # fmt: skip
         elif dtype in ["fp8", "fp8fp16", "bf8"]:
             # TODO
             pass
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index 9c2ce62856..9f79bdbee6 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -1121,6 +1121,20 @@ llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
 
+// dwordx3 - use union to convert between int32x3 and fp16/bf16 types
+union dwordx3_union
+{
+    int32_t as_i32[3];
+    fp16_t as_fp16[6];
+    bf16_t as_bf16[6];
+};
+
+CK_TILE_DEVICE_EXTERN int32x3_t
+llvm_amdgcn_raw_buffer_load_i32x3(int32x4_t srsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v3i32");
+
 CK_TILE_DEVICE_EXTERN int32x4_t
 llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                   index_t voffset,
@@ -1540,9 +1554,9 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
         (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
             (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (std::is_same<T, fp16_t>::value &&
-             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
+             (N == 1 || N == 2 || N == 4 || N == 6 || N == 8 || N == 16 || N == 32)) ||
             (std::is_same<T, bf16_t>::value &&
-             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
+             (N == 1 || N == 2 || N == 4 || N == 6 || N == 8 || N == 16 || N == 32)) ||
             (std::is_same<T, int32_t>::value &&
              (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
@@ -1659,6 +1673,26 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
                                                    src_wave_addr_offset,
                                                    static_cast<index_t>(coherence)));
         }
+        else if constexpr(N == 6)
+        {
+            // N = 6: load as dwordx3 (12 bytes = 6 fp16), using buffer_load_dwordx3 instruction
+            int32x3_t tmp_i32x3 =
+                llvm_amdgcn_raw_buffer_load_i32x3(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence));
+
+            // Use union to reinterpret int32x3 as fp16x6
+            dwordx3_union tmp_union;
+            tmp_union.as_i32[0] = tmp_i32x3[0];
+            tmp_union.as_i32[1] = tmp_i32x3[1];
+            tmp_union.as_i32[2] = tmp_i32x3[2];
+
+            thread_buffer<fp16_t, N> result;
+            static_for<0, N, 1>{}([&](auto i) { result[i] = tmp_union.as_fp16[i]; });
+
+            return result;
+        }
         else if constexpr(N == 8)
         {
             // use fp32 load to mimic fp16 load
@@ -1744,6 +1778,26 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
                                                   src_wave_addr_offset,
                                                   static_cast<index_t>(coherence)));
         }
+        else if constexpr(N == 6)
+        {
+            // N = 6: load as dwordx3 (12 bytes = 6 bf16), using buffer_load_dwordx3 instruction
+            int32x3_t tmp_i32x3 =
+                llvm_amdgcn_raw_buffer_load_i32x3(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence));
+
+            // Use union to reinterpret int32x3 as bf16x6
+            dwordx3_union tmp_union;
+            tmp_union.as_i32[0] = tmp_i32x3[0];
+            tmp_union.as_i32[1] = tmp_i32x3[1];
+            tmp_union.as_i32[2] = tmp_i32x3[2];
+
+            thread_buffer<bf16_t, N> result;
+            static_for<0, N, 1>{}([&](auto i) { result[i] = tmp_union.as_bf16[i]; });
+
+            return result;
+        }
         else if constexpr(N == 8)
         {
             int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
index 764df83539..4627b249d6 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
@@ -989,6 +989,20 @@ llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
 
+// dwordx3 - use union to convert between int32x3 and fp16/bf16 types
+union dwordx3_union
+{
+    int32_t as_i32[3];
+    fp16_t as_fp16[6];
+    bf16_t as_bf16[6];
+};
+
+CK_TILE_DEVICE_EXTERN int32x3_t
+llvm_amdgcn_raw_buffer_load_i32x3(int32x4_t srsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v3i32");
+
 CK_TILE_DEVICE_EXTERN int32x4_t
 llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                   index_t voffset,
@@ -1408,9 +1422,9 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
         (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
             (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (std::is_same<T, fp16_t>::value &&
-             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
+             (N == 1 || N == 2 || N == 4 || N == 6 || N == 8 || N == 16 || N == 32)) ||
             (std::is_same<T, bf16_t>::value &&
-             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
+             (N == 1 || N == 2 || N == 4 || N == 6 || N == 8 || N == 16 || N == 32)) ||
             (std::is_same<T, int32_t>::value &&
              (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
@@ -1529,6 +1543,26 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
                                                    src_wave_addr_offset,
                                                    static_cast<index_t>(coherence)));
         }
+        else if constexpr(N == 6)
+        {
+            // N = 6: load as dwordx3 (12 bytes = 6 fp16), using buffer_load_dwordx3 instruction
+            int32x3_t tmp_i32x3 =
+                llvm_amdgcn_raw_buffer_load_i32x3(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence));
+
+            // Use union to reinterpret int32x3 as fp16x6
+            dwordx3_union tmp_union;
+            tmp_union.as_i32[0] = tmp_i32x3[0];
+            tmp_union.as_i32[1] = tmp_i32x3[1];
+            tmp_union.as_i32[2] = tmp_i32x3[2];
+
+            thread_buffer<fp16_t, N> result;
+            static_for<0, N, 1>{}([&](auto i) { result[i] = tmp_union.as_fp16[i]; });
+
+            return result;
+        }
         else
         {
             // N >= 8: build from fp32x4 chunks
@@ -1571,6 +1605,26 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
                                                   src_wave_addr_offset,
                                                   static_cast<index_t>(coherence)));
         }
+        else if constexpr(N == 6)
+        {
+            // N = 6: load as dwordx3 (12 bytes = 6 bf16), using buffer_load_dwordx3 instruction
+            int32x3_t tmp_i32x3 =
+                llvm_amdgcn_raw_buffer_load_i32x3(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence));
+
+            // Use union to reinterpret int32x3 as bf16x6
+            dwordx3_union tmp_union;
+            tmp_union.as_i32[0] = tmp_i32x3[0];
+            tmp_union.as_i32[1] = tmp_i32x3[1];
+            tmp_union.as_i32[2] = tmp_i32x3[2];
+
+            thread_buffer<bf16_t, N> result;
+            static_for<0, N, 1>{}([&](auto i) { result[i] = tmp_union.as_bf16[i]; });
+
+            return result;
+        }
         else
         {
             // N >= 8: build from fp32x4 chunks
diff --git a/include/ck_tile/core/numeric/vector_type.hpp b/include/ck_tile/core/numeric/vector_type.hpp
index 6921210b34..90ddc2a56e 100644
--- a/include/ck_tile/core/numeric/vector_type.hpp
+++ b/include/ck_tile/core/numeric/vector_type.hpp
@@ -152,6 +152,7 @@ using bf16x64_t = bfloat16_t __attribute__((ext_vector_type(64)));
 // i32
 // using int32_t = ...
 using int32x2_t  = int32_t __attribute__((ext_vector_type(2)));
+using int32x3_t  = int32_t __attribute__((ext_vector_type(3)));
 using int32x4_t  = int32_t __attribute__((ext_vector_type(4)));
 using int32x8_t  = int32_t __attribute__((ext_vector_type(8)));
 using int32x16_t = int32_t __attribute__((ext_vector_type(16)));
diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
index ee5238869f..4045e31b17 100644
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
@@ -12,6 +12,8 @@ static CK_TILE_HOST_DEVICE constexpr index_t ceil_to_qualified_tile_length()
 {
     if constexpr(Headdim == 48)
         return 48;
+    else if constexpr(Headdim == 80)
+        return 96;
     else if constexpr(Headdim == 96)
         return 128;
     else if constexpr(Headdim == 160)
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index 9643af1de0..31d724deb6 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -25,7 +25,7 @@ GPU_TARGETS="gfx908;gfx90a;gfx942"
 if [ $# -ge 1 ]; then
     case "$1" in
         gfx*)
-            GPU_TARGETS=$1
+            GPU_TARGETS="$1"
             shift 1
             echo "GPU targets provided: $GPU_TARGETS"
             REST_ARGS=("$@")
@@ -44,8 +44,8 @@ cmake
 -D CMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker -fbracket-depth=512" \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
--D GPU_TARGETS=$GPU_TARGETS                                                                       \
+-D GPU_TARGETS="$GPU_TARGETS"                                                                     \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
-"${REST_ARGS[@]}"                                                                                 \                                                                                     \
-${MY_PROJECT_SOURCE}
+"${REST_ARGS[@]}"                                                                                 \
+"${MY_PROJECT_SOURCE}"
diff --git a/script/monitor_sccache_during_build.sh b/script/monitor_sccache_during_build.sh
new file mode 100644
index 0000000000..3f52d73c4c
--- /dev/null
+++ b/script/monitor_sccache_during_build.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# Continuous monitoring script for sccache during builds
+# Usage: ./monitor_sccache_during_build.sh [log_prefix] &
+
+LOG_PREFIX=${1:-"sccache_monitor"}
+
+# Include stage name in log filename if available
+STAGE_SUFFIX=""
+if [ -n "${STAGE_NAME}" ]; then
+    # Convert stage name to filename-safe format (replace spaces and special chars with underscores)
+    STAGE_SAFE=$(echo "${STAGE_NAME}" | sed 's/[^a-zA-Z0-9]/_/g' | sed 's/__*/_/g' | sed 's/^_\|_$//g')
+    STAGE_SUFFIX="_${STAGE_SAFE}"
+fi
+
+MONITOR_LOG="logs/${LOG_PREFIX}_$(date +%Y%m%d_%H%M%S)${STAGE_SUFFIX}.log"
+MONITOR_INTERVAL=30  # seconds
+
+echo "Starting sccache monitoring - logging to $MONITOR_LOG"
+echo "Monitor interval: $MONITOR_INTERVAL seconds"
+
+# Function to log with timestamp
+log_with_timestamp() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$MONITOR_LOG"
+}
+
+# Function to get sccache stats safely
+get_sccache_stats() {
+    if command -v sccache &> /dev/null; then
+        sccache --show-stats 2>/dev/null || echo "sccache stats unavailable"
+    else
+        echo "sccache command not found"
+    fi
+}
+
+# Function to check if sccache server is running
+is_sccache_running() {
+    if command -v sccache &> /dev/null; then
+        sccache --show-stats &> /dev/null
+        return $?
+    else
+        return 1
+    fi
+}
+
+# Function to test Redis connectivity
+test_redis_connectivity() {
+    # Use SCCACHE_REDIS if set, otherwise construct from CK_SCCACHE
+    local REDIS_URL=""
+    if [ -n "${SCCACHE_REDIS}" ]; then
+        REDIS_URL="${SCCACHE_REDIS}"
+    elif [ -n "${CK_SCCACHE}" ]; then
+        REDIS_URL="redis://${CK_SCCACHE}"
+    fi
+    
+    if [ -n "${REDIS_URL}" ]; then
+        local start_time=$(date +%s%N)
+        local response=$(timeout 5 redis-cli -u "${REDIS_URL}" ping 2>&1) || response="TIMEOUT"
+        local end_time=$(date +%s%N)
+        local latency=$(( (end_time - start_time) / 1000000 ))
+        echo "Redis: $response (${latency}ms)"
+    else
+        echo "Redis: No Redis URL available"
+    fi
+}
+
+# Gets the last sccache stats before exiting
+cleanup() {
+    log_with_timestamp "=== FINAL SCCACHE STATS EXIT ==="
+    log_with_timestamp "$(get_sccache_stats)"
+    echo "=== CONTINUOUS MONITORING STOPPED ==="
+    # List monitoring logs
+    echo "=== MONITORING LOGS ==="
+    ls -la logs/*monitor*.log 2>/dev/null || echo "No monitoring logs found"
+}
+trap cleanup EXIT
+
+log_with_timestamp "=== SCCACHE MONITORING STARTED ==="
+log_with_timestamp "PID: $$"
+log_with_timestamp "Node: ${NODE_NAME:-$(hostname)}"
+log_with_timestamp "Stage: ${STAGE_NAME:-unknown}"
+log_with_timestamp "WORKSPACE_PATH: ${WORKSPACE:-not set}"
+log_with_timestamp "SCCACHE_C_CUSTOM_CACHE_BUSTER: ${SCCACHE_C_CUSTOM_CACHE_BUSTER:-not set}"
+log_with_timestamp "CK_SCCACHE: ${CK_SCCACHE:-not set}"
+
+# Initial state
+log_with_timestamp "=== INITIAL STATE ==="
+# Reset sscache stats
+sccache --zero-stats
+log_with_timestamp "$(get_sccache_stats) $(test_redis_connectivity)"
+
+# Monitor loop
+while true; do
+    sleep $MONITOR_INTERVAL
+    
+    # Check if sccache server is still running
+    if ! is_sccache_running; then
+        log_with_timestamp "WARNING: sccache server not running!"
+    fi
+    
+    # Get current stats
+    current_stats=$(get_sccache_stats)
+    redis_status=$(test_redis_connectivity)
+    
+    # Log current cache hit information
+    log_with_timestamp "$(get_sccache_stats) $(test_redis_connectivity)"
+    
+    # Check for Redis latency issues
+    if echo "$redis_status" | grep -E "[0-9]{3,}" > /dev/null; then  # >100ms latency
+        log_with_timestamp "HIGH REDIS LATENCY detected"
+    fi
+    
+    # Check for Redis connection failures
+    if echo "$redis_status" | grep -E "(TIMEOUT|Connection refused|No route)" > /dev/null; then
+        log_with_timestamp "REDIS CONNECTION FAILURE detected"
+    fi
+done
\ No newline at end of file