diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp
index e626603949..783fc661ce 100644
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -3,7 +3,6 @@
 #pragma once
 
 #include "data_type.hpp"
-#include "amd_inline_asm.hpp"
 
 namespace ck {
 
@@ -1065,48 +1064,4 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
 }
 #endif
 
-template <index_t N>
-__device__ typename vector_type<int8_t, N>::type
-amd_s_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
-                           index_t src_wave_addr_offset)
-{
-    static_assert(N == 4 || N == 8, "wrong! not implemented");
-    // TODO: add other variants of s_buffer_load
-    if constexpr(N == 4)
-    {
-        int32_t tmp =
-            amd_assembly_s_buffer_load_b32(src_wave_buffer_resource, src_wave_addr_offset);
-        return bit_cast<int8x4_t>(tmp);
-    }
-    else if constexpr(N == 8)
-    {
-        int32x2_t tmp =
-            amd_assembly_s_buffer_load_b64(src_wave_buffer_resource, src_wave_addr_offset);
-        return bit_cast<int8x8_t>(tmp);
-    }
-}
-
-template <typename T, index_t N>
-__device__ typename vector_type<T, N>::type
-amd_s_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
-                       index_t src_wave_addr_offset)
-{
-    static_assert((is_same<T, double>::value && (N == 1)) ||
-                      (is_same<T, float>::value && (N == 1 || N == 2)) ||
-                      (is_same<T, half_t>::value && (N == 2 || N == 4)) ||
-                      (is_same<T, bhalf_t>::value && (N == 2 || N == 4)) ||
-                      (is_same<T, int32_t>::value && (N == 1 || N == 2)) ||
-                      (is_same<T, f8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, bf8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, int8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, uint8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, pk_i4_t>::value && (N == 4 || N == 8)),
-                  "wrong! not implemented");
-
-    using r_t = typename vector_type<T, N>::type;
-
-    auto raw_data =
-        amd_s_buffer_load_impl_raw<sizeof(T) * N>(src_wave_buffer_resource, src_wave_addr_offset);
-    return bit_cast<r_t>(raw_data);
-}
-
 } // namespace ck
diff --git a/include/ck/utility/amd_buffer_addressing_builtins.hpp b/include/ck/utility/amd_buffer_addressing_builtins.hpp
index 06a4ec199d..f642e06050 100644
--- a/include/ck/utility/amd_buffer_addressing_builtins.hpp
+++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp
@@ -3,7 +3,6 @@
 #pragma once
 
 #include "data_type.hpp"
-#include "amd_inline_asm.hpp"
 
 namespace ck {
 
@@ -886,48 +885,4 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
 }
 #endif
 
-template <index_t N>
-__device__ typename vector_type<int8_t, N>::type
-amd_s_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
-                           index_t src_wave_addr_offset)
-{
-    static_assert(N == 4 || N == 8, "wrong! not implemented");
-    // TODO: add other variants of s_buffer_load
-    if constexpr(N == 4)
-    {
-        int32_t tmp =
-            amd_assembly_s_buffer_load_b32(src_wave_buffer_resource, src_wave_addr_offset);
-        return bit_cast<int8x4_t>(tmp);
-    }
-    else if constexpr(N == 8)
-    {
-        int32x2_t tmp =
-            amd_assembly_s_buffer_load_b64(src_wave_buffer_resource, src_wave_addr_offset);
-        return bit_cast<int8x8_t>(tmp);
-    }
-}
-
-template <typename T, index_t N>
-__device__ typename vector_type<T, N>::type
-amd_s_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
-                       index_t src_wave_addr_offset)
-{
-    static_assert((is_same<T, double>::value && (N == 1)) ||
-                      (is_same<T, float>::value && (N == 1 || N == 2)) ||
-                      (is_same<T, half_t>::value && (N == 2 || N == 4)) ||
-                      (is_same<T, bhalf_t>::value && (N == 2 || N == 4)) ||
-                      (is_same<T, int32_t>::value && (N == 1 || N == 2)) ||
-                      (is_same<T, f8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, bf8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, int8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, uint8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, pk_i4_t>::value && (N == 4 || N == 8)),
-                  "wrong! not implemented");
-
-    using r_t = typename vector_type<T, N>::type;
-
-    auto raw_data =
-        amd_s_buffer_load_impl_raw<sizeof(T) * N>(src_wave_buffer_resource, src_wave_addr_offset);
-    return bit_cast<r_t>(raw_data);
-}
-
 } // namespace ck
diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp
index e9f9e407d6..79efd77edb 100644
--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -432,28 +432,5 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
 }
 #endif
 
-// s_buffer_loads
-inline __device__ int32_t
-amd_assembly_s_buffer_load_b32(__amdgpu_buffer_rsrc_t src_wave_buffer_resource, unsigned int offset)
-{
-    int32_t result;
-    asm volatile("s_buffer_load_b32 %0, %1, %2"
-                 : "=s"(result)
-                 : "s"(src_wave_buffer_resource), "s"(offset)
-                 : "memory");
-    return result;
-}
-
-inline __device__ int32x2_t
-amd_assembly_s_buffer_load_b64(__amdgpu_buffer_rsrc_t src_wave_buffer_resource, unsigned int offset)
-{
-    int32x2_t result;
-    asm volatile("s_buffer_load_b64 %0, %1, %2"
-                 : "=s"(result)
-                 : "s"(src_wave_buffer_resource), "s"(offset)
-                 : "memory");
-    return result;
-}
-
 } // namespace ck
 #endif
"s_buffer_prefetch" : "s_prefetch"); - pass &= ck::s_prefetch_op_util::test_constant_prefetch_impl< - decltype(std::get{}>(prefetch_kernel_container)), - T>(std::get{}>(prefetch_kernel_container), kernel_name); + + auto kernel = std::get{}>(prefetch_kernel_container); + + pass &= ck::s_prefetch_op_util:: + test_prefetch_impl( + time_kernels, kernel, kernel_name); }); return pass; } -int main(int, char*[]) +int main(int argc, char* argv[]) { if(!ck::is_gfx12_supported()) { @@ -56,13 +46,20 @@ int main(int, char*[]) return 0; } + bool time_kernels = false; + + if(argc == 2) + { + time_kernels = std::stoi(argv[1]); + } + bool pass = true; std::cout << "=== Testing Constant Cache Prefetch ===" << std::endl; // Test different data types - pass &= run_test(); - pass &= run_test(); + pass &= run_test(time_kernels); + pass &= run_test(time_kernels); std::cout << "TestConstantPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; return pass ? 0 : 1; diff --git a/test/s_prefetch_op/s_prefetch_op_util.hpp b/test/s_prefetch_op/s_prefetch_op_util.hpp index e894baf677..077b876b1a 100644 --- a/test/s_prefetch_op/s_prefetch_op_util.hpp +++ b/test/s_prefetch_op/s_prefetch_op_util.hpp @@ -1,133 +1,123 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/host_utility/hip_check_error.hpp" + +#include + namespace ck { namespace s_prefetch_op_util { -// Prefetch to constant cache using AMD builtin with chunks_to_prefetch(1..32: 1 chunk = 128B) -template -__device__ __forceinline__ void prefetch_to_constant_cache(const T* addr, - unsigned int chunks_to_prefetch) +// Enable scalar prefetch in hardware (required on gfx12 before using s_prefetch) +__device__ __forceinline__ void enable_scalar_prefetch() { #if defined(__gfx12__) - assert(chunks_to_prefetch > 0 && chunks_to_prefetch <= 32); - __builtin_amdgcn_s_prefetch_data(addr, chunks_to_prefetch - 1); // we need to pass 0..31 -#else - // ignore - not supported - (void)addr; - (void)chunks_to_prefetch; -#endif -} - -// Prefetch to constant cache using AMD builtin with chunks_to_prefetch(1..32: 1 chunk = 128B) -template -__device__ __forceinline__ void prefetch_to_constant_cache(__amdgpu_buffer_rsrc_t buf_res, - unsigned int chunks_to_prefetch) -{ -#if defined(__gfx12__) - assert(chunks_to_prefetch > 0 && chunks_to_prefetch <= 32); - __builtin_amdgcn_s_buffer_prefetch_data(buf_res, offset, chunks_to_prefetch - 1); -#else - // ignore - not supported - (void)buf_res; - (void)chunks_to_prefetch; + // SCALAR_PREFETCH_EN is bit 24 in MODE register (hwreg 1) + // Set 1 bit at offset 24 to value 1 + __builtin_amdgcn_s_setreg(1 | (24 << 6), 1); // Set bit to 1 #endif } template -__global__ void kernel_with_scalar_prefetch(const T* src, - T* dst, - const void CK_CONSTANT_ADDRESS_SPACE* scalar_data, - index_t num_elements, - index_t num_scalars) +struct SPrefetchDataOp { - index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + // Prefetch to constant cache using AMD builtin with cachelines to prefetch(1..32) + __device__ __forceinline__ void operator()(const T CK_CONSTANT_ADDRESS_SPACE* addr, + unsigned int num_cachelines) const + { +#if defined(__gfx12__) + assert(num_cachelines > 0 && num_cachelines <= 32); + __builtin_amdgcn_s_prefetch_data(addr, num_cachelines - 1); // we 
need to pass 0..31 +#else + // ignore - not supported + (void)addr; + (void)num_cachelines; +#endif + } +}; - const T CK_CONSTANT_ADDRESS_SPACE* scalar_elems = - static_cast(scalar_data); +template +struct SBufferPrefetchDataOp +{ + // Prefetch to constant cache using AMD builtin with cachelines to prefetch(1..32) + __device__ __forceinline__ void operator()(const T CK_CONSTANT_ADDRESS_SPACE* addr, + unsigned int num_cachelines) const + { +#if defined(__gfx12__) + __amdgpu_buffer_rsrc_t buf_res = make_wave_buffer_resource_new(addr, NUM_SCALARS); + assert(num_cachelines > 0 && num_cachelines <= 32); + __builtin_amdgcn_s_buffer_prefetch_data(buf_res, 0, num_cachelines - 1); +#else + // ignore - not supported + (void)addr; + (void)num_cachelines; +#endif + } +}; - // Calculate number of 128B chunks needed to cover num_scalars elements - constexpr index_t chunk_size_bytes = 128; - constexpr index_t elements_per_chunk = chunk_size_bytes / sizeof(T); - unsigned int chunks_needed = (num_scalars + elements_per_chunk - 1) / elements_per_chunk; +template +__global__ void kernel_with_prefetch(const T* src, + T* dst, + const T CK_CONSTANT_ADDRESS_SPACE* scalar_data, + bool enable_prefetch) +{ + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; - // Prefetch all scalar data at once using chunks parameter + // Calculate number of 128B cachelines needed to cover num_scalars elements + constexpr index_t cachelineSize = 128; + constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T); + constexpr unsigned int cachelinesNeeded = + (NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize; + + // Prefetch all scalar data at once if(threadIdx.x == 0) { - prefetch_to_constant_cache(scalar_elems, chunks_needed); + if(enable_prefetch) + { + enable_scalar_prefetch(); + } + PrefetchOp{}(scalar_data, cachelinesNeeded); } T sum = 0; - if(tid < num_elements) + if(tid < NUM_THREADS) { - sum = src[tid]; // load from global mem to make sure prefetch finished + sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to + // finishs } __syncthreads(); // waits on loads from global mem - if(tid < num_elements) + if(tid < NUM_THREADS) { // Access prefetched scalar data - for(index_t i = 0; i < num_scalars; i++) + for(uint32_t i = 0; i < NUM_SCALARS; i++) { - sum += scalar_elems[i]; // should be fast due to scalars being preloaded + sum += scalar_data[i]; // should be fast due to scalars being preloaded } dst[tid] = sum; } } -template -__global__ void -kernel_with_scalar_buffer_prefetch(const T* src, - T* dst, - const void CK_CONSTANT_ADDRESS_SPACE* scalar_data, - index_t num_elements, - index_t num_scalars) +template +bool test_prefetch_impl(bool time_kernels, + const PrefetchKernel& prefetch_kernel, + const std::string& kernel_name) { - index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - - const T CK_CONSTANT_ADDRESS_SPACE* scalar_elems = - static_cast(scalar_data); - - // Calculate number of 128B chunks needed to cover num_scalars elements - constexpr index_t chunk_size_bytes = 128; - constexpr index_t elements_per_chunk = chunk_size_bytes / sizeof(T); - unsigned int chunks_needed = (num_scalars + elements_per_chunk - 1) / elements_per_chunk; - - __amdgpu_buffer_rsrc_t src_wave_buffer_resource = - make_wave_buffer_resource_new(scalar_elems, num_scalars); - - // Prefetch all scalar data at once using chunks parameter - if(threadIdx.x == 0) - { - prefetch_to_constant_cache<0>(src_wave_buffer_resource, chunks_needed); - } - - T sum = 0; - if(tid < 
num_elements) - { - sum = src[tid]; // load from global mem to make sure prefetch finished - } - __syncthreads(); // waits on loads from global mem - if(tid < num_elements) - { - // Access prefetched scalar data - for(index_t i = 0; i < num_scalars; i++) - { - sum += amd_s_buffer_load_impl( - src_wave_buffer_resource, - i * sizeof(T)); // should be fast due to scalars being preloaded - } - - dst[tid] = sum; - } -} - -template -bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel, - const std::string& kernel_name) -{ - constexpr index_t num_elements = 512; - constexpr index_t num_scalars = 512; + // TODO: maybe add more prefetch instructions inside kernel to support more values + assert(NUM_SCALARS / sizeof(T) < (128 * 32)); + constexpr index_t num_elements = NUM_THREADS; + constexpr index_t num_scalars = NUM_SCALARS; constexpr index_t block_size = 256; constexpr index_t grid_size = (num_elements + block_size - 1) / block_size; @@ -171,14 +161,75 @@ bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel, hipStream_t stream; hip_check_error(hipStreamCreate(&stream)); - prefetch_kernel<<>>( - static_cast(d_src.GetDeviceBuffer()), - static_cast(d_dst_with_prefetch_chunks.GetDeviceBuffer()), - cast_pointer_to_constant_address_space(d_scalar.GetDeviceBuffer()), - num_elements, - num_scalars); + if(time_kernels) + { + ck::static_for<0, 2, 1>{}([&](auto static_i) { + constexpr bool prefetch_enabled = static_i == 0; + std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl; - hip_check_error(hipStreamSynchronize(stream)); + constexpr int num_warmup = 1; + constexpr int num_iterations = 10; + + // Warmup runs + for(int i = 0; i < num_warmup; i++) + { + prefetch_kernel<<>>( + static_cast(d_src.GetDeviceBuffer()), + static_cast(d_dst_with_prefetch_chunks.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + static_cast(d_scalar.GetDeviceBuffer())), + prefetch_enabled); + } + hip_check_error(hipStreamSynchronize(stream)); + + // Performance measurement + hipEvent_t start, stop; + hip_check_error(hipEventCreate(&start)); + hip_check_error(hipEventCreate(&stop)); + + hip_check_error(hipEventRecord(start, stream)); + for(int i = 0; i < num_iterations; i++) + { + prefetch_kernel<<>>( + static_cast(d_src.GetDeviceBuffer()), + static_cast(d_dst_with_prefetch_chunks.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + static_cast(d_scalar.GetDeviceBuffer())), + prefetch_enabled); + } + hip_check_error(hipEventRecord(stop, stream)); + + hip_check_error(hipStreamSynchronize(stream)); + + float elapsed_ms = 0; + hip_check_error(hipEventElapsedTime(&elapsed_ms, start, stop)); + + float avg_time_us = (elapsed_ms * 1000.0f) / num_iterations; + float total_bytes = (num_elements * sizeof(T) + num_scalars * sizeof(T)); // read + float bandwidth_gb_s = (total_bytes / (avg_time_us * 1e-6)) / 1e9; + float ops_per_iteration = num_elements * num_scalars; // adds + float gflops = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9; + + std::cout << " Performance: " << std::endl; + std::cout << " Average kernel time: " << avg_time_us << " us" << std::endl; + std::cout << " Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl; + std::cout << " Compute throughput: " << gflops << " GFLOPS" << std::endl; + + hip_check_error(hipEventDestroy(start)); + hip_check_error(hipEventDestroy(stop)); + }); + } + else + { + prefetch_kernel<<>>( + static_cast(d_src.GetDeviceBuffer()), + static_cast(d_dst_with_prefetch_chunks.GetDeviceBuffer()), + 
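The driver above resolves each kernel out of a tuple at compile time: ck::static_for hands the lambda an integral-constant index, which std::get can consume as a constant expression even though it arrives as an ordinary function argument. A self-contained sketch of that dispatch idiom in plain C++17, with std::integral_constant standing in for ck::Number (the names here are illustrative, not CK APIs):

#include <iostream>
#include <string>
#include <tuple>
#include <utility>

// Minimal stand-in for ck::static_for: invokes f(integral_constant<int, I>{})
// for I = Begin, ..., End - 1.
template <int Begin, int End, typename F>
void static_for(F&& f)
{
    if constexpr(Begin < End)
    {
        f(std::integral_constant<int, Begin>{});
        static_for<Begin + 1, End>(std::forward<F>(f));
    }
}

int main()
{
    // Two different closure types in one tuple, like the two kernel pointers above
    auto kernels = std::make_tuple([] { return std::string{"s_prefetch"}; },
                                   [] { return std::string{"s_buffer_prefetch"}; });

    static_for<0, 2>([&](auto i) {
        // decltype(i)::value is a compile-time constant, so std::get selects the
        // tuple element statically while the loop body reads like runtime code
        auto kernel = std::get<decltype(i)::value>(kernels);
        std::cout << "running " << kernel() << std::endl;
    });
    return 0;
}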
diff --git a/test/s_prefetch_op/s_prefetch_op_util.hpp b/test/s_prefetch_op/s_prefetch_op_util.hpp
index e894baf677..077b876b1a 100644
--- a/test/s_prefetch_op/s_prefetch_op_util.hpp
+++ b/test/s_prefetch_op/s_prefetch_op_util.hpp
@@ -1,133 +1,123 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+
+#include <hip/hip_runtime.h>
+
 namespace ck {
 namespace s_prefetch_op_util {
 
-// Prefetch to constant cache using AMD builtin with chunks_to_prefetch(1..32: 1 chunk = 128B)
-template <typename T>
-__device__ __forceinline__ void prefetch_to_constant_cache(const T* addr,
-                                                           unsigned int chunks_to_prefetch)
+// Enable scalar prefetch in hardware (required on gfx12 before using s_prefetch)
+__device__ __forceinline__ void enable_scalar_prefetch()
 {
 #if defined(__gfx12__)
-    assert(chunks_to_prefetch > 0 && chunks_to_prefetch <= 32);
-    __builtin_amdgcn_s_prefetch_data(addr, chunks_to_prefetch - 1); // we need to pass 0..31
-#else
-    // ignore - not supported
-    (void)addr;
-    (void)chunks_to_prefetch;
-#endif
-}
-
-// Prefetch to constant cache using AMD builtin with chunks_to_prefetch(1..32: 1 chunk = 128B)
-template <index_t offset>
-__device__ __forceinline__ void prefetch_to_constant_cache(__amdgpu_buffer_rsrc_t buf_res,
-                                                           unsigned int chunks_to_prefetch)
-{
-#if defined(__gfx12__)
-    assert(chunks_to_prefetch > 0 && chunks_to_prefetch <= 32);
-    __builtin_amdgcn_s_buffer_prefetch_data(buf_res, offset, chunks_to_prefetch - 1);
-#else
-    // ignore - not supported
-    (void)buf_res;
-    (void)chunks_to_prefetch;
+    // SCALAR_PREFETCH_EN is bit 24 in the MODE register (hwreg 1):
+    // set the 1-bit field at offset 24 to value 1
+    __builtin_amdgcn_s_setreg(1 | (24 << 6), 1);
 #endif
 }
 
 template <typename T>
-__global__ void kernel_with_scalar_prefetch(const T* src,
-                                            T* dst,
-                                            const void CK_CONSTANT_ADDRESS_SPACE* scalar_data,
-                                            index_t num_elements,
-                                            index_t num_scalars)
+struct SPrefetchDataOp
 {
-    index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    // Prefetch to constant cache using the AMD builtin; num_cachelines is 1..32 (128B each)
+    __device__ __forceinline__ void operator()(const T CK_CONSTANT_ADDRESS_SPACE* addr,
+                                               unsigned int num_cachelines) const
+    {
+#if defined(__gfx12__)
+        assert(num_cachelines > 0 && num_cachelines <= 32);
+        __builtin_amdgcn_s_prefetch_data(addr, num_cachelines - 1); // we need to pass 0..31
+#else
+        // ignore - not supported
+        (void)addr;
+        (void)num_cachelines;
+#endif
+    }
+};
 
-    const T CK_CONSTANT_ADDRESS_SPACE* scalar_elems =
-        static_cast<const T CK_CONSTANT_ADDRESS_SPACE*>(scalar_data);
+template <typename T, index_t NUM_SCALARS>
+struct SBufferPrefetchDataOp
+{
+    // Prefetch to constant cache using the AMD builtin; num_cachelines is 1..32 (128B each)
+    __device__ __forceinline__ void operator()(const T CK_CONSTANT_ADDRESS_SPACE* addr,
+                                               unsigned int num_cachelines) const
+    {
+#if defined(__gfx12__)
+        __amdgpu_buffer_rsrc_t buf_res = make_wave_buffer_resource_new(addr, NUM_SCALARS);
+        assert(num_cachelines > 0 && num_cachelines <= 32);
+        __builtin_amdgcn_s_buffer_prefetch_data(buf_res, 0, num_cachelines - 1);
+#else
+        // ignore - not supported
+        (void)addr;
+        (void)num_cachelines;
+#endif
+    }
+};
 
-    // Calculate number of 128B chunks needed to cover num_scalars elements
-    constexpr index_t chunk_size_bytes   = 128;
-    constexpr index_t elements_per_chunk = chunk_size_bytes / sizeof(T);
-    unsigned int chunks_needed = (num_scalars + elements_per_chunk - 1) / elements_per_chunk;
+template <typename T, index_t NUM_THREADS, index_t NUM_SCALARS, typename PrefetchOp>
+__global__ void kernel_with_prefetch(const T* src,
+                                     T* dst,
+                                     const T CK_CONSTANT_ADDRESS_SPACE* scalar_data,
+                                     bool enable_prefetch)
+{
+    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-    // Prefetch all scalar data at once using chunks parameter
+    // Calculate number of 128B cachelines needed to cover NUM_SCALARS elements
+    constexpr index_t cacheline_size         = 128;
+    constexpr index_t elements_per_cacheline = cacheline_size / sizeof(T);
+    constexpr unsigned int cachelines_needed =
+        (NUM_SCALARS + elements_per_cacheline - 1) / elements_per_cacheline;
+
+    // Prefetch all scalar data at once
     if(threadIdx.x == 0)
     {
-        prefetch_to_constant_cache(scalar_elems, chunks_needed);
+        if(enable_prefetch)
+        {
+            enable_scalar_prefetch();
+        }
+        PrefetchOp{}(scalar_data, cachelines_needed);
    }
 
     T sum = 0;
-    if(tid < num_elements)
+    if(tid < NUM_THREADS)
     {
-        sum = src[tid]; // load from global mem to make sure prefetch finished
+        sum = src[tid]; // load from global mem to give the prefetch time to finish or come
+                        // close to finishing
     }
     __syncthreads(); // waits on loads from global mem
-    if(tid < num_elements)
+    if(tid < NUM_THREADS)
     {
         // Access prefetched scalar data
-        for(index_t i = 0; i < num_scalars; i++)
+        for(uint32_t i = 0; i < NUM_SCALARS; i++)
         {
-            sum += scalar_elems[i]; // should be fast due to scalars being preloaded
+            sum += scalar_data[i]; // should be fast due to scalars being preloaded
         }
 
         dst[tid] = sum;
     }
 }
 
-template <typename T>
-__global__ void
-kernel_with_scalar_buffer_prefetch(const T* src,
-                                   T* dst,
-                                   const void CK_CONSTANT_ADDRESS_SPACE* scalar_data,
-                                   index_t num_elements,
-                                   index_t num_scalars)
+template <typename PrefetchKernel, typename T, index_t NUM_THREADS, index_t NUM_SCALARS>
+bool test_prefetch_impl(bool time_kernels,
+                        const PrefetchKernel& prefetch_kernel,
+                        const std::string& kernel_name)
 {
-    index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    const T CK_CONSTANT_ADDRESS_SPACE* scalar_elems =
-        static_cast<const T CK_CONSTANT_ADDRESS_SPACE*>(scalar_data);
-
-    // Calculate number of 128B chunks needed to cover num_scalars elements
-    constexpr index_t chunk_size_bytes   = 128;
-    constexpr index_t elements_per_chunk = chunk_size_bytes / sizeof(T);
-    unsigned int chunks_needed = (num_scalars + elements_per_chunk - 1) / elements_per_chunk;
-
-    __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
-        make_wave_buffer_resource_new(scalar_elems, num_scalars);
-
-    // Prefetch all scalar data at once using chunks parameter
-    if(threadIdx.x == 0)
-    {
-        prefetch_to_constant_cache<0>(src_wave_buffer_resource, chunks_needed);
-    }
-
-    T sum = 0;
-    if(tid < num_elements)
-    {
-        sum = src[tid]; // load from global mem to make sure prefetch finished
-    }
-    __syncthreads(); // waits on loads from global mem
-    if(tid < num_elements)
-    {
-        // Access prefetched scalar data
-        for(index_t i = 0; i < num_scalars; i++)
-        {
-            sum += amd_s_buffer_load_impl<T, 1>(
-                src_wave_buffer_resource,
-                i * sizeof(T)); // should be fast due to scalars being preloaded
-        }
-
-        dst[tid] = sum;
-    }
-}
-
-template <typename PrefetchKernel, typename T>
-bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel,
-                                 const std::string& kernel_name)
-{
-    constexpr index_t num_elements = 512;
-    constexpr index_t num_scalars  = 512;
+    // TODO: maybe add more prefetch instructions inside the kernel to support more values
+    assert(NUM_SCALARS * sizeof(T) <= (128 * 32));
+    constexpr index_t num_elements = NUM_THREADS;
+    constexpr index_t num_scalars  = NUM_SCALARS;
 
     constexpr index_t block_size = 256;
     constexpr index_t grid_size  = (num_elements + block_size - 1) / block_size;
@@ -171,14 +161,75 @@ bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel,
     hipStream_t stream;
     hip_check_error(hipStreamCreate(&stream));
 
-    prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
-        static_cast<T*>(d_src.GetDeviceBuffer()),
-        static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
-        cast_pointer_to_constant_address_space(d_scalar.GetDeviceBuffer()),
-        num_elements,
-        num_scalars);
+    if(time_kernels)
+    {
+        ck::static_for<0, 2, 1>{}([&](auto static_i) {
+            constexpr bool prefetch_enabled = static_i == 0;
+            std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl;
 
-    hip_check_error(hipStreamSynchronize(stream));
+            constexpr int num_warmup     = 1;
+            constexpr int num_iterations = 10;
+
+            // Warmup runs
+            for(int i = 0; i < num_warmup; i++)
+            {
+                prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
+                    static_cast<T*>(d_src.GetDeviceBuffer()),
+                    static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
+                    cast_pointer_to_constant_address_space(
+                        static_cast<T*>(d_scalar.GetDeviceBuffer())),
+                    prefetch_enabled);
+            }
+            hip_check_error(hipStreamSynchronize(stream));
+
+            // Performance measurement
+            hipEvent_t start, stop;
+            hip_check_error(hipEventCreate(&start));
+            hip_check_error(hipEventCreate(&stop));
+
+            hip_check_error(hipEventRecord(start, stream));
+            for(int i = 0; i < num_iterations; i++)
+            {
+                prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
+                    static_cast<T*>(d_src.GetDeviceBuffer()),
+                    static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
+                    cast_pointer_to_constant_address_space(
+                        static_cast<T*>(d_scalar.GetDeviceBuffer())),
+                    prefetch_enabled);
+            }
+            hip_check_error(hipEventRecord(stop, stream));
+
+            hip_check_error(hipStreamSynchronize(stream));
+
+            float elapsed_ms = 0;
+            hip_check_error(hipEventElapsedTime(&elapsed_ms, start, stop));
+
+            float avg_time_us       = (elapsed_ms * 1000.0f) / num_iterations;
+            float total_bytes       = (num_elements * sizeof(T) + num_scalars * sizeof(T)); // read
+            float bandwidth_gb_s    = (total_bytes / (avg_time_us * 1e-6)) / 1e9;
+            float ops_per_iteration = num_elements * num_scalars; // adds
+            float gflops            = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9;
+
+            std::cout << "  Performance: " << std::endl;
+            std::cout << "    Average kernel time: " << avg_time_us << " us" << std::endl;
+            std::cout << "    Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl;
+            std::cout << "    Compute throughput: " << gflops << " GFLOPS" << std::endl;
+
+            hip_check_error(hipEventDestroy(start));
+            hip_check_error(hipEventDestroy(stop));
+        });
+    }
+    else
+    {
+        prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
+            static_cast<T*>(d_src.GetDeviceBuffer()),
+            static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                static_cast<T*>(d_scalar.GetDeviceBuffer())),
+            true);
+
+        hip_check_error(hipStreamSynchronize(stream));
+    }
 
     // Copy results back
     d_dst_with_prefetch_chunks.FromDevice(h_dst_with_prefetch_chunks.data());
@@ -186,7 +237,8 @@ bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel,
     // Verify results
     bool pass = ck::utils::check_err(h_dst_with_prefetch_chunks, h_expected);
 
-    std::cout << (pass ? "PASS" : "FAIL") << std::endl;
+    std::cout << "  Correctness: " << (pass ? "PASS" : "FAIL") << std::endl;
+    std::cout << std::endl;
 
     hip_check_error(hipStreamDestroy(stream));
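A note on the immediate in enable_scalar_prefetch(): __builtin_amdgcn_s_setreg takes the standard s_setreg SIMM16 encoding, with the hardware-register id in bits [5:0], the field offset in bits [10:6], and the field size minus one in bits [15:11]. So 1 | (24 << 6) selects hwreg 1 (MODE) and a 1-bit field at offset 24, which is SCALAR_PREFETCH_EN per the comment in the diff. A small compile-time check of that encoding (the hwreg helper below is illustrative, not a CK or HIP API):

#include <cstdint>

// s_setreg SIMM16 layout: id in [5:0], offset in [10:6], (size - 1) in [15:11]
constexpr uint32_t hwreg(uint32_t id, uint32_t offset, uint32_t size)
{
    return id | (offset << 6) | ((size - 1) << 11);
}

// Reproduces the literal used by enable_scalar_prefetch() above
static_assert(hwreg(/*id=*/1, /*offset=*/24, /*size=*/1) == (1 | (24 << 6)),
              "SCALAR_PREFETCH_EN immediate mismatch");

int main() { return 0; }

This also suggests why the timing path threads enable_prefetch through to the kernel: the prefetch op itself always executes, but with the MODE bit left at zero it should have no effect, giving a like-for-like baseline for the ENABLED/DISABLED comparison.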