diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bfa77b8445..bb80273102 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -31,6 +31,11 @@ repos:
         entry: projects/composablekernel/script/check_ascii_only.sh
         language: script
         types_or: [c++, inc]
+    -   id: crlf-checker
+        name: Check for CRLF line endings in C/C++ sources
+        entry: projects/composablekernel/script/check_no_crlf.sh
+        language: script
+        types_or: [c++, inc]
     -   id: remove-exec-bit
         name: Remove executable bit from non-executable files
         entry: projects/composablekernel/script/remove_exec_bit.sh
diff --git a/Jenkinsfile b/Jenkinsfile
index bd6b00360f..3197b222ce 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -348,6 +348,24 @@ pipeline {
                         cleanWs()
                     }
                 }
+                stage('CRLF Check') {
+                    agent{ label rocmnode("nogpu") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_cmd = """cd .. && \
+                                find . -type f \\( -name '*.h' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.h.in' -o -name '*.hpp.in' -o -name '*.cpp.in' -o -name '*.inc' -o -name '*.cl' \\) \
+                                -not -path '*/build/*' -not -path '*/include/rapidjson/*' \
+                                -print0 | xargs -0 -P 8 -n 64 script/check_no_crlf.sh"""
+                    }
+                    steps{
+                        deleteDir()
+                        script {
+                            loadCk();
+                            ck.buildAndTest(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd)
+                        }
+                        cleanWs()
+                    }
+                }
             }
         }
          stage("Run Downstream Tests")
diff --git a/include/ck/utility/data_cache_prefetch.hpp b/include/ck/utility/data_cache_prefetch.hpp
index 8eeb726b44..e3e3625381 100644
--- a/include/ck/utility/data_cache_prefetch.hpp
+++ b/include/ck/utility/data_cache_prefetch.hpp
@@ -1,33 +1,33 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#pragma once
-
-#include "ck/utility/amd_buffer_coherence.hpp"
-
-namespace ck {
-
-template <AmdBufferCoherenceEnum Coherence_ = AmdBufferCoherenceEnum::DefaultCoherence>
-struct GlobalPrefetchDataOp
-{
-    // addr needs to point to global memory!
-    __device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const
-    {
-#if defined(__gfx125__)
-        __builtin_amdgcn_global_prefetch(addr, static_cast<index_t>(Coherence_));
-#endif
-    }
-};
-
-template <AmdBufferCoherenceEnum Coherence_ = AmdBufferCoherenceEnum::DefaultCoherence>
-struct FlatPrefetchDataOp
-{
-    __device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const
-    {
-#if defined(__gfx125__)
-        __builtin_amdgcn_flat_prefetch(addr, static_cast<index_t>(Coherence_));
-#endif
-    }
-};
-
-} // namespace ck
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck/utility/amd_buffer_coherence.hpp"
+
+namespace ck {
+
+template <AmdBufferCoherenceEnum Coherence_ = AmdBufferCoherenceEnum::DefaultCoherence>
+struct GlobalPrefetchDataOp
+{
+    // addr needs to point to global memory!
+    __device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const
+    {
+#if defined(__gfx125__)
+        __builtin_amdgcn_global_prefetch(addr, static_cast<index_t>(Coherence_));
+#endif
+    }
+};
+
+template <AmdBufferCoherenceEnum Coherence_ = AmdBufferCoherenceEnum::DefaultCoherence>
+struct FlatPrefetchDataOp
+{
+    __device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const
+    {
+#if defined(__gfx125__)
+        __builtin_amdgcn_flat_prefetch(addr, static_cast<index_t>(Coherence_));
+#endif
+    }
+};
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_data_cache_prefetch_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_data_cache_prefetch_instance.cpp
index d3eef8f907..5941ef3179 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_data_cache_prefetch_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_data_cache_prefetch_instance.cpp
@@ -1,80 +1,80 @@
-// Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
-// SPDX-License-Identifier: MIT
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/host_utility/device_prop.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using BF16 = ck::bhalf_t;
-using F32  = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto GemmDefault      = ck::tensor_operation::device::GemmSpecialization::Default;
-static constexpr auto BlkGemmPipeSched = ck::BlockGemmPipelineScheduler::Intrawave;
-static constexpr auto BlkGemmPipeVer   = ck::BlockGemmPipelineVersion::v3;
-
-// A[m, k] * B[n, k] = C[m, n] with data cache prefetch support
-template <bool UseDataCachePrefetch>
-using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances = std::tuple<
-    // clang-format off
-    //#########################|ALayout|BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer| KPer | AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|   BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|                 |               | Compute | Compute | Permute | Minimum  |         Use         |
-    //#########################|       |       |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|    PipeScheduler|    PipelineVer|  TypeA  |  TypeB  |    A/B  | Occupancy|  DataCachePrefetch  |
-    //#########################|       |       |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                 |               |         |         |         |          |                     |
-    //#########################|       |       |        |      |      |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                 |               |               |              |               |               |          |            |            |                             |                |                 |               |         |         |         |          |                     |
-    // 128x128x64
-    DeviceGemm_Xdl_CShuffleV3<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    64,   8,   8,   16,   16,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,      S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           2,               S<1, 32, 1, 8>,               8, BlkGemmPipeSched, BlkGemmPipeVer,     BF16,     BF16,    false,        0, UseDataCachePrefetch>,
-    // 256x128x64
-    DeviceGemm_Xdl_CShuffleV3<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   256,   128,    64,   8,   8,   16,   16,    8,    4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,      S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           2,               S<1, 32, 1, 8>,               8, BlkGemmPipeSched, BlkGemmPipeVer,     BF16,     BF16,    false,        0, UseDataCachePrefetch>,
-    // 128x256x64
-    DeviceGemm_Xdl_CShuffleV3<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   256,    64,   8,   8,   16,   16,    4,    8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,      S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           2,               S<1, 32, 1, 8>,               8, BlkGemmPipeSched, BlkGemmPipeVer,     BF16,     BF16,    false,        0, UseDataCachePrefetch>,
-    // 256x256x64
-    DeviceGemm_Xdl_CShuffleV3<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   256,   256,    64,   8,   8,   16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,      S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           2,               S<1, 32, 1, 8>,               8, BlkGemmPipeSched, BlkGemmPipeVer,     BF16,     BF16,    false,        0, UseDataCachePrefetch>
-    // clang-format on
-    >;
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_prefetch_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances)
-{
-    if(ck::is_gfx125_supported())
-    {
-        add_device_operation_instances(
-            instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances<true>{});
-    }
-}
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_no_prefetch_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances)
-{
-    if(ck::is_gfx125_supported())
-    {
-        add_device_operation_instances(
-            instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances<false>{});
-    }
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
+// Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault      = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto BlkGemmPipeSched = ck::BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto BlkGemmPipeVer   = ck::BlockGemmPipelineVersion::v3;
+
+// A[m, k] * B[n, k] = C[m, n] with data cache prefetch support
+template <bool UseDataCachePrefetch>
+using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances = std::tuple<
+    // clang-format off
+    //#########################|ALayout|BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer| KPer | AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|   BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|                 |               | Compute | Compute | Permute | Minimum  |         Use         |
+    //#########################|       |       |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|    PipeScheduler|    PipelineVer|  TypeA  |  TypeB  |    A/B  | Occupancy|  DataCachePrefetch  |
+    //#########################|       |       |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                 |               |         |         |         |          |                     |
+    //#########################|       |       |        |      |      |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                 |               |               |              |               |               |          |            |            |                             |                |                 |               |         |         |         |          |                     |
+    // 128x128x64
+    DeviceGemm_Xdl_CShuffleV3<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    64,   8,   8,   16,   16,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,      S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           2,               S<1, 32, 1, 8>,               8, BlkGemmPipeSched, BlkGemmPipeVer,     BF16,     BF16,    false,        0, UseDataCachePrefetch>,
+    // 256x128x64
+    DeviceGemm_Xdl_CShuffleV3<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   256,   128,    64,   8,   8,   16,   16,    8,    4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,      S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           2,               S<1, 32, 1, 8>,               8, BlkGemmPipeSched, BlkGemmPipeVer,     BF16,     BF16,    false,        0, UseDataCachePrefetch>,
+    // 128x256x64
+    DeviceGemm_Xdl_CShuffleV3<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   256,    64,   8,   8,   16,   16,    4,    8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,      S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           2,               S<1, 32, 1, 8>,               8, BlkGemmPipeSched, BlkGemmPipeVer,     BF16,     BF16,    false,        0, UseDataCachePrefetch>,
+    // 256x256x64
+    DeviceGemm_Xdl_CShuffleV3<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   256,   256,    64,   8,   8,   16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,      S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           2,               S<1, 32, 1, 8>,               8, BlkGemmPipeSched, BlkGemmPipeVer,     BF16,     BF16,    false,        0, UseDataCachePrefetch>
+    // clang-format on
+    >;
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_prefetch_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    if(ck::is_gfx125_supported())
+    {
+        add_device_operation_instances(
+            instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances<true>{});
+    }
+}
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_no_prefetch_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    if(ck::is_gfx125_supported())
+    {
+        add_device_operation_instances(
+            instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances<false>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/script/check_no_crlf.sh b/script/check_no_crlf.sh
new file mode 100755
index 0000000000..514213fad2
--- /dev/null
+++ b/script/check_no_crlf.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# Rejects Windows CRLF line endings (a trailing carriage return) in the
+# files passed as arguments. Used both by the local pre-commit hook and
+# by the Jenkinsfile "CRLF Check" static-check stage.
+#
+# Usage: ./check_no_crlf.sh <file1> <file2> ...
+
+exit_code=0
+
+for file in "$@"; do
+    [[ -f "$file" ]] || continue
+    if LC_ALL=C grep -qP '\r$' "$file" 2>/dev/null; then
+        echo "ERROR: $file contains CRLF (Windows) line endings:"
+        LC_ALL=C grep -nP '\r$' "$file" | head -20 | sed 's/\r$/<CR>/'
+        echo "  Fix: convert to LF, e.g. 'sed -i 's/\\r\$//' $file' or 'dos2unix $file'"
+        exit_code=1
+    fi
+done
+
+exit $exit_code
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp
index e7ba00e4ef..7375f43f1d 100644
--- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp
@@ -1,31 +1,31 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "test_gemm_quant_common.hpp"
-
-using GroupSize2D128N = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
-
-// Type combinations for ABQuant tests
-// Tuple format: <ALayout, BLayout, CLayout, AQLayout, ADataType, BDataType, QDataType, CDataType,
-// QuantType, GemmConfig, AQuantGroupSize, BQuantGroupSize, BQLayout>
-// clang-format off
-using ABQuantPreshuffleQuantTypes = ::testing::Types<
-    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, FP8, FP8, float, Half, ABQuantGrouped, GemmConfigPreshuffleBPreshuffleQuantPrefill<false>, GroupSize1D_128, GroupSize1D_128, ColumnMajor>,
-    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, FP8, FP8, float, Half, ABQuantGrouped, GemmConfigPreshuffleBPreshuffleQuantPrefill<true>, GroupSize1D_128, GroupSize2D128N, ColumnMajor>
->;
-// clang-format on
-
-// Test suite for ABQuant
-TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleQuantTypes);
-
-// AQuant tests
-TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest)
-{
-    using BQuantGroupSize = std::tuple_element_t<11, TypeParam>;
-    if(ck_tile::is_gfx120_supported() && std::is_same_v<BQuantGroupSize, GroupSize2D128N>)
-    {
-        GTEST_SKIP() << "temp disable due to random fail on gfx120.";
-    }
-
-    this->run_test_with_validation(1024, 1024, 1024);
-}
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "test_gemm_quant_common.hpp"
+
+using GroupSize2D128N = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
+
+// Type combinations for ABQuant tests
+// Tuple format: <ALayout, BLayout, CLayout, AQLayout, ADataType, BDataType, QDataType, CDataType,
+// QuantType, GemmConfig, AQuantGroupSize, BQuantGroupSize, BQLayout>
+// clang-format off
+using ABQuantPreshuffleQuantTypes = ::testing::Types<
+    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, FP8, FP8, float, Half, ABQuantGrouped, GemmConfigPreshuffleBPreshuffleQuantPrefill<false>, GroupSize1D_128, GroupSize1D_128, ColumnMajor>,
+    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, FP8, FP8, float, Half, ABQuantGrouped, GemmConfigPreshuffleBPreshuffleQuantPrefill<true>, GroupSize1D_128, GroupSize2D128N, ColumnMajor>
+>;
+// clang-format on
+
+// Test suite for ABQuant
+TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleQuantTypes);
+
+// AQuant tests
+TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest)
+{
+    using BQuantGroupSize = std::tuple_element_t<11, TypeParam>;
+    if(ck_tile::is_gfx120_supported() && std::is_same_v<BQuantGroupSize, GroupSize2D128N>)
+    {
+        GTEST_SKIP() << "temp disable due to random fail on gfx120.";
+    }
+
+    this->run_test_with_validation(1024, 1024, 1024);
+}
diff --git a/test/prefetch_op/prefetch_op.cpp b/test/prefetch_op/prefetch_op.cpp
index 66cc6ba9c7..a3bc8db411 100644
--- a/test/prefetch_op/prefetch_op.cpp
+++ b/test/prefetch_op/prefetch_op.cpp
@@ -1,74 +1,74 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "ck/ck.hpp"
-#include "ck/host_utility/device_prop.hpp"
-
-#include "prefetch_op_util.hpp"
-
-template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, bool IS_L1_PREFETCH>
-bool run_test(bool time_kernels)
-{
-    bool pass = true;
-
-#if defined(__gfx125__)
-    const auto coherence =
-        IS_L1_PREFETCH ? ck::AmdBufferCoherenceEnum::CU_RT : ck::AmdBufferCoherenceEnum::SE_RT;
-    using global_prefetch_op = ck::GlobalPrefetchDataOp<coherence>;
-    using flat_prefetch_op   = ck::FlatPrefetchDataOp<coherence>;
-#else
-    using global_prefetch_op = ck::GlobalPrefetchDataOp<>;
-    using flat_prefetch_op   = ck::FlatPrefetchDataOp<>;
-#endif
-
-    const auto global_prefetch_kernel =
-        ck::prefetch_op_util::kernel_with_prefetch<T, NUM_THREADS, NUM_SCALARS, global_prefetch_op>;
-    const auto flat_prefetch_kernel = ck::prefetch_op_util::
-        kernel_with_prefetch_and_shared_mem<T, NUM_THREADS, NUM_SCALARS, flat_prefetch_op>;
-
-    const auto prefetch_kernel_container =
-        std::make_tuple(global_prefetch_kernel, flat_prefetch_kernel);
-
-    ck::static_for<0, 2, 1>{}([&](auto i) {
-        std::string kernel_name = (i == 1 ? "flat_prefetch" : "global_prefetch");
-
-        auto kernel = std::get<ck::Number<i>{}>(prefetch_kernel_container);
-
-        pass &=
-            ck::prefetch_op_util::test_prefetch_impl<decltype(kernel), T, NUM_THREADS, NUM_SCALARS>(
-                time_kernels, kernel, kernel_name);
-    });
-
-    return pass;
-}
-
-int main(int argc, char* argv[])
-{
-    if(!ck::is_gfx125_supported())
-    {
-        std::cout << "This feature is not supported by current HW, skipping tests." << std::endl;
-        return 0;
-    }
-
-    bool time_kernels = false;
-
-    if(argc == 2)
-    {
-        time_kernels = std::stoi(argv[1]);
-    }
-
-    bool pass = true;
-
-    std::cout << "=== Testing L2 Global Cache Prefetch ===" << std::endl;
-
-    pass &= run_test<float, 4096, 1024, false>(time_kernels);
-    pass &= run_test<double, 4096, 512, false>(time_kernels);
-
-    std::cout << "=== Testing L1 Global Cache Prefetch ===" << std::endl;
-
-    pass &= run_test<float, 4096, 1024, true>(time_kernels);
-    pass &= run_test<double, 4096, 512, true>(time_kernels);
-
-    std::cout << "TestGlobalPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
-    return pass ? 0 : 1;
-}
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck/ck.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+#include "prefetch_op_util.hpp"
+
+template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, bool IS_L1_PREFETCH>
+bool run_test(bool time_kernels)
+{
+    bool pass = true;
+
+#if defined(__gfx125__)
+    const auto coherence =
+        IS_L1_PREFETCH ? ck::AmdBufferCoherenceEnum::CU_RT : ck::AmdBufferCoherenceEnum::SE_RT;
+    using global_prefetch_op = ck::GlobalPrefetchDataOp<coherence>;
+    using flat_prefetch_op   = ck::FlatPrefetchDataOp<coherence>;
+#else
+    using global_prefetch_op = ck::GlobalPrefetchDataOp<>;
+    using flat_prefetch_op   = ck::FlatPrefetchDataOp<>;
+#endif
+
+    const auto global_prefetch_kernel =
+        ck::prefetch_op_util::kernel_with_prefetch<T, NUM_THREADS, NUM_SCALARS, global_prefetch_op>;
+    const auto flat_prefetch_kernel = ck::prefetch_op_util::
+        kernel_with_prefetch_and_shared_mem<T, NUM_THREADS, NUM_SCALARS, flat_prefetch_op>;
+
+    const auto prefetch_kernel_container =
+        std::make_tuple(global_prefetch_kernel, flat_prefetch_kernel);
+
+    ck::static_for<0, 2, 1>{}([&](auto i) {
+        std::string kernel_name = (i == 1 ? "flat_prefetch" : "global_prefetch");
+
+        auto kernel = std::get<ck::Number<i>{}>(prefetch_kernel_container);
+
+        pass &=
+            ck::prefetch_op_util::test_prefetch_impl<decltype(kernel), T, NUM_THREADS, NUM_SCALARS>(
+                time_kernels, kernel, kernel_name);
+    });
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    if(!ck::is_gfx125_supported())
+    {
+        std::cout << "This feature is not supported by current HW, skipping tests." << std::endl;
+        return 0;
+    }
+
+    bool time_kernels = false;
+
+    if(argc == 2)
+    {
+        time_kernels = std::stoi(argv[1]);
+    }
+
+    bool pass = true;
+
+    std::cout << "=== Testing L2 Global Cache Prefetch ===" << std::endl;
+
+    pass &= run_test<float, 4096, 1024, false>(time_kernels);
+    pass &= run_test<double, 4096, 512, false>(time_kernels);
+
+    std::cout << "=== Testing L1 Global Cache Prefetch ===" << std::endl;
+
+    pass &= run_test<float, 4096, 1024, true>(time_kernels);
+    pass &= run_test<double, 4096, 512, true>(time_kernels);
+
+    std::cout << "TestGlobalPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
+    return pass ? 0 : 1;
+}
diff --git a/test/prefetch_op/prefetch_op_util.hpp b/test/prefetch_op/prefetch_op_util.hpp
index 47ecbc3d36..790eb2ab98 100644
--- a/test/prefetch_op/prefetch_op_util.hpp
+++ b/test/prefetch_op/prefetch_op_util.hpp
@@ -1,276 +1,276 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "ck/utility/common_header.hpp"
-
-#include "ck/ck.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/host_utility/hip_check_error.hpp"
-#include "ck/host_utility/kernel_launch.hpp"
-#include "ck/host_utility/flush_cache.hpp"
-
-#include <hip/hip_runtime.h>
-
-#include "ck/utility/data_cache_prefetch.hpp"
-
-namespace ck {
-namespace prefetch_op_util {
-
-template <typename T>
-struct KernelArgs
-{
-    const T* p_a_grid;
-    T* dst;
-    const T* p_b_grid;
-    bool enable_prefetch;
-};
-
-template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, typename PrefetchOp>
-__global__ void kernel_with_prefetch(KernelArgs<T> args)
-{
-    const T* src         = args.p_a_grid;
-    T* dst               = args.dst;
-    const T* scalar_data = args.p_b_grid;
-    bool enable_prefetch = args.enable_prefetch;
-
-    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    // Calculate number of 32B cachelines needed to cover num_scalars elements
-    constexpr index_t cachelineSize              = 32;
-    constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T);
-    constexpr unsigned int cachelinesNeeded =
-        (NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize;
-
-    const char* byte_addr = reinterpret_cast<const char*>(scalar_data);
-
-    // Prefetch all scalar data at once
-    if(tid < cachelinesNeeded)
-    {
-        if(enable_prefetch)
-        {
-            // Prefetch the cacheline
-            PrefetchOp{}(byte_addr + tid * cachelineSize);
-        }
-    }
-
-    T sum = 0;
-    if(tid < NUM_THREADS)
-    {
-        sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to
-                        // finish
-    }
-    __syncthreads(); // waits on loads from global mem
-    if(tid < NUM_THREADS)
-    {
-        // Access prefetched scalar data
-        for(uint32_t i = 0; i < NUM_SCALARS; i++)
-        {
-            sum += scalar_data[i]; // should be fast due to scalars being preloaded
-        }
-
-        dst[tid] = sum;
-    }
-}
-
-template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, typename PrefetchOp>
-__global__ void kernel_with_prefetch_and_shared_mem(KernelArgs<T> args)
-{
-    const T* src         = args.p_a_grid;
-    T* dst               = args.dst;
-    const T* scalar_data = args.p_b_grid;
-    bool enable_prefetch = args.enable_prefetch;
-
-    __shared__ T sharedMem[32];
-
-    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    // Calculate number of 32B cachelines needed to cover num_scalars elements
-    constexpr index_t cachelineSize              = 32;
-    constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T);
-    constexpr unsigned int cachelinesNeeded =
-        (NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize;
-
-    bool use_shared_mem = tid % 2 == 1;
-
-    const void* byte_addr;
-    if(use_shared_mem)
-    {
-        byte_addr = reinterpret_cast<const void*>(sharedMem);
-    }
-    else
-    {
-        uintptr_t base   = reinterpret_cast<uintptr_t>(scalar_data);
-        uintptr_t offset = base + (tid / 2) * cachelineSize;
-        byte_addr        = reinterpret_cast<const void*>(offset);
-    }
-
-    // Prefetch all scalar data at once
-    if(tid < cachelinesNeeded * 2)
-    {
-        if(enable_prefetch)
-        {
-            // Prefetch the cacheline
-            PrefetchOp{}(byte_addr);
-        }
-        else
-        {
-            (void)byte_addr;
-        }
-    }
-
-    T sum = 0;
-    if(tid < NUM_THREADS)
-    {
-        sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to
-                        // finish
-    }
-    __syncthreads(); // waits on loads from global mem
-    if(tid < NUM_THREADS)
-    {
-        // Access prefetched scalar data
-        for(uint32_t i = 0; i < NUM_SCALARS; i++)
-        {
-            sum += scalar_data[i]; // should be fast due to scalars being preloaded
-        }
-
-        dst[tid] = sum;
-    }
-}
-
-template <typename PrefetchKernel, typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS>
-bool test_prefetch_impl(bool time_kernels,
-                        const PrefetchKernel& prefetch_kernel,
-                        const std::string& kernel_name)
-{
-    constexpr index_t block_size   = 256;
-    constexpr index_t num_elements = NUM_THREADS;
-    constexpr index_t num_scalars  = NUM_SCALARS;
-
-    // TODO: maybe add more prefetch instructions inside kernel to support more values
-    assert(NUM_SCALARS / sizeof(T) < (32 * block_size) &&
-           "Too many scalars to prefetch with current implementation!");
-
-    constexpr index_t grid_size = (num_elements + block_size - 1) / block_size;
-
-    std::cout << "Testing " << kernel_name << " for type: " << typeid(T).name() << std::endl;
-    std::cout << "Elements: " << num_elements << ", Scalars: " << num_scalars << std::endl;
-
-    // Host data
-    std::vector<T> h_src(num_elements);
-    std::vector<T> h_scalar(num_scalars);
-    std::vector<T> h_dst_with_prefetch_chunks(num_elements);
-    std::vector<T> h_expected(num_elements);
-
-    // Initialize data
-    for(index_t i = 0; i < num_elements; i++)
-    {
-        h_src[i] = static_cast<T>(i % 100);
-    }
-
-    T scalar_sum = 0;
-    for(index_t i = 0; i < num_scalars; i++)
-    {
-        h_scalar[i] = static_cast<T>(i + 1);
-        scalar_sum += h_scalar[i];
-    }
-
-    // Expected results
-    for(index_t i = 0; i < num_elements; i++)
-    {
-        h_expected[i] = h_src[i] + scalar_sum;
-    }
-
-    // Device memory
-    DeviceMem d_src(sizeof(T) * num_elements);
-    DeviceMem d_scalar(sizeof(T) * num_scalars);
-    DeviceMem d_dst_with_prefetch_chunks(sizeof(T) * num_elements);
-
-    d_src.ToDevice(h_src.data());
-    d_scalar.ToDevice(h_scalar.data());
-
-    KernelArgs<T> args{static_cast<const T*>(d_src.GetDeviceBuffer()),
-                       static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
-                       static_cast<const T*>(d_scalar.GetDeviceBuffer()),
-                       true};
-    if(time_kernels)
-    {
-        std::array<float, 2> avg_times_us;
-        ck::static_for<0, 2, 1>{}([&](auto static_i) {
-            constexpr bool prefetch_enabled = static_i == 0;
-            std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl;
-
-            args.enable_prefetch = prefetch_enabled;
-
-            constexpr int num_warmup     = 1;
-            constexpr int num_iterations = 10;
-            constexpr int rotating_count = num_iterations;
-            auto size_a_buffer           = d_src.GetBufferSize();
-            auto size_b_buffer           = d_scalar.GetBufferSize();
-
-            ck::utility::RotatingMemWrapper<KernelArgs<T>> rotating_mem(
-                args, rotating_count, size_a_buffer, size_b_buffer);
-            rotating_mem.Print();
-
-            auto run_flush_cache = [&]() {
-                // flush icache
-                ck::utility::flush_icache();
-                // rotating mem
-                rotating_mem.Next();
-            };
-            float avg_time_ms = ck::utility::launch_and_time_kernel_with_preprocess<false>(
-                StreamConfig{nullptr, true, 0, num_warmup, num_iterations, true, rotating_count},
-                run_flush_cache,
-                prefetch_kernel,
-                dim3(grid_size),
-                dim3(block_size),
-                0,
-                args);
-
-            float avg_time_us       = avg_time_ms * 1000.0f;
-            float total_bytes       = (size_a_buffer + size_b_buffer); // read
-            float bandwidth_gb_s    = (total_bytes / (avg_time_us * 1e-6)) / 1e9;
-            float ops_per_iteration = num_elements * num_scalars; // adds
-            float gflops            = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9;
-
-            std::cout << "  Performance: " << std::endl;
-            std::cout << "    Average kernel time: " << avg_time_us << " us" << std::endl;
-            std::cout << "    Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl;
-            std::cout << "    Compute throughput: " << gflops << " GFLOPS" << std::endl;
-
-            avg_times_us[static_i] = avg_time_us;
-        });
-
-        float speedup = avg_times_us[1] / avg_times_us[0];
-
-        std::cout << "On average kernel with prefetch is " << speedup
-                  << " times faster than without prefetch." << std::endl;
-
-        if(speedup < 1.0f)
-            std::cout << "WARNING: prefetch kernel is slower!" << std::endl;
-    }
-    else
-    {
-        launch_and_time_kernel(StreamConfig{nullptr, false},
-                               prefetch_kernel,
-                               dim3(grid_size),
-                               dim3(block_size),
-                               0, // lds_byte
-                               args);
-    }
-
-    // Copy results back
-    d_dst_with_prefetch_chunks.FromDevice(h_dst_with_prefetch_chunks.data());
-
-    // Verify results
-    bool pass = ck::utils::check_err(h_dst_with_prefetch_chunks, h_expected);
-
-    std::cout << "  Correctness: " << (pass ? "PASS" : "FAIL") << std::endl;
-    std::cout << std::endl;
-
-    return pass;
-}
-
-} // namespace prefetch_op_util
-} // namespace ck
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck/utility/common_header.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+
+#include <hip/hip_runtime.h>
+
+#include "ck/utility/data_cache_prefetch.hpp"
+
+namespace ck {
+namespace prefetch_op_util {
+
+template <typename T>
+struct KernelArgs
+{
+    const T* p_a_grid;
+    T* dst;
+    const T* p_b_grid;
+    bool enable_prefetch;
+};
+
+template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, typename PrefetchOp>
+__global__ void kernel_with_prefetch(KernelArgs<T> args)
+{
+    const T* src         = args.p_a_grid;
+    T* dst               = args.dst;
+    const T* scalar_data = args.p_b_grid;
+    bool enable_prefetch = args.enable_prefetch;
+
+    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Calculate number of 32B cachelines needed to cover num_scalars elements
+    constexpr index_t cachelineSize              = 32;
+    constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T);
+    constexpr unsigned int cachelinesNeeded =
+        (NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize;
+
+    const char* byte_addr = reinterpret_cast<const char*>(scalar_data);
+
+    // Prefetch all scalar data at once
+    if(tid < cachelinesNeeded)
+    {
+        if(enable_prefetch)
+        {
+            // Prefetch the cacheline
+            PrefetchOp{}(byte_addr + tid * cachelineSize);
+        }
+    }
+
+    T sum = 0;
+    if(tid < NUM_THREADS)
+    {
+        sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to
+                        // finish
+    }
+    __syncthreads(); // waits on loads from global mem
+    if(tid < NUM_THREADS)
+    {
+        // Access prefetched scalar data
+        for(uint32_t i = 0; i < NUM_SCALARS; i++)
+        {
+            sum += scalar_data[i]; // should be fast due to scalars being preloaded
+        }
+
+        dst[tid] = sum;
+    }
+}
+
+template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, typename PrefetchOp>
+__global__ void kernel_with_prefetch_and_shared_mem(KernelArgs<T> args)
+{
+    const T* src         = args.p_a_grid;
+    T* dst               = args.dst;
+    const T* scalar_data = args.p_b_grid;
+    bool enable_prefetch = args.enable_prefetch;
+
+    __shared__ T sharedMem[32];
+
+    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Calculate number of 32B cachelines needed to cover num_scalars elements
+    constexpr index_t cachelineSize              = 32;
+    constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T);
+    constexpr unsigned int cachelinesNeeded =
+        (NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize;
+
+    bool use_shared_mem = tid % 2 == 1;
+
+    const void* byte_addr;
+    if(use_shared_mem)
+    {
+        byte_addr = reinterpret_cast<const void*>(sharedMem);
+    }
+    else
+    {
+        uintptr_t base   = reinterpret_cast<uintptr_t>(scalar_data);
+        uintptr_t offset = base + (tid / 2) * cachelineSize;
+        byte_addr        = reinterpret_cast<const void*>(offset);
+    }
+
+    // Prefetch all scalar data at once
+    if(tid < cachelinesNeeded * 2)
+    {
+        if(enable_prefetch)
+        {
+            // Prefetch the cacheline
+            PrefetchOp{}(byte_addr);
+        }
+        else
+        {
+            (void)byte_addr;
+        }
+    }
+
+    T sum = 0;
+    if(tid < NUM_THREADS)
+    {
+        sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to
+                        // finish
+    }
+    __syncthreads(); // waits on loads from global mem
+    if(tid < NUM_THREADS)
+    {
+        // Access prefetched scalar data
+        for(uint32_t i = 0; i < NUM_SCALARS; i++)
+        {
+            sum += scalar_data[i]; // should be fast due to scalars being preloaded
+        }
+
+        dst[tid] = sum;
+    }
+}
+
+template <typename PrefetchKernel, typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS>
+bool test_prefetch_impl(bool time_kernels,
+                        const PrefetchKernel& prefetch_kernel,
+                        const std::string& kernel_name)
+{
+    constexpr index_t block_size   = 256;
+    constexpr index_t num_elements = NUM_THREADS;
+    constexpr index_t num_scalars  = NUM_SCALARS;
+
+    // TODO: maybe add more prefetch instructions inside kernel to support more values
+    assert(NUM_SCALARS / sizeof(T) < (32 * block_size) &&
+           "Too many scalars to prefetch with current implementation!");
+
+    constexpr index_t grid_size = (num_elements + block_size - 1) / block_size;
+
+    std::cout << "Testing " << kernel_name << " for type: " << typeid(T).name() << std::endl;
+    std::cout << "Elements: " << num_elements << ", Scalars: " << num_scalars << std::endl;
+
+    // Host data
+    std::vector<T> h_src(num_elements);
+    std::vector<T> h_scalar(num_scalars);
+    std::vector<T> h_dst_with_prefetch_chunks(num_elements);
+    std::vector<T> h_expected(num_elements);
+
+    // Initialize data
+    for(index_t i = 0; i < num_elements; i++)
+    {
+        h_src[i] = static_cast<T>(i % 100);
+    }
+
+    T scalar_sum = 0;
+    for(index_t i = 0; i < num_scalars; i++)
+    {
+        h_scalar[i] = static_cast<T>(i + 1);
+        scalar_sum += h_scalar[i];
+    }
+
+    // Expected results
+    for(index_t i = 0; i < num_elements; i++)
+    {
+        h_expected[i] = h_src[i] + scalar_sum;
+    }
+
+    // Device memory
+    DeviceMem d_src(sizeof(T) * num_elements);
+    DeviceMem d_scalar(sizeof(T) * num_scalars);
+    DeviceMem d_dst_with_prefetch_chunks(sizeof(T) * num_elements);
+
+    d_src.ToDevice(h_src.data());
+    d_scalar.ToDevice(h_scalar.data());
+
+    KernelArgs<T> args{static_cast<const T*>(d_src.GetDeviceBuffer()),
+                       static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
+                       static_cast<const T*>(d_scalar.GetDeviceBuffer()),
+                       true};
+    if(time_kernels)
+    {
+        std::array<float, 2> avg_times_us;
+        ck::static_for<0, 2, 1>{}([&](auto static_i) {
+            constexpr bool prefetch_enabled = static_i == 0;
+            std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl;
+
+            args.enable_prefetch = prefetch_enabled;
+
+            constexpr int num_warmup     = 1;
+            constexpr int num_iterations = 10;
+            constexpr int rotating_count = num_iterations;
+            auto size_a_buffer           = d_src.GetBufferSize();
+            auto size_b_buffer           = d_scalar.GetBufferSize();
+
+            ck::utility::RotatingMemWrapper<KernelArgs<T>> rotating_mem(
+                args, rotating_count, size_a_buffer, size_b_buffer);
+            rotating_mem.Print();
+
+            auto run_flush_cache = [&]() {
+                // flush icache
+                ck::utility::flush_icache();
+                // rotating mem
+                rotating_mem.Next();
+            };
+            float avg_time_ms = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                StreamConfig{nullptr, true, 0, num_warmup, num_iterations, true, rotating_count},
+                run_flush_cache,
+                prefetch_kernel,
+                dim3(grid_size),
+                dim3(block_size),
+                0,
+                args);
+
+            float avg_time_us       = avg_time_ms * 1000.0f;
+            float total_bytes       = (size_a_buffer + size_b_buffer); // read
+            float bandwidth_gb_s    = (total_bytes / (avg_time_us * 1e-6)) / 1e9;
+            float ops_per_iteration = num_elements * num_scalars; // adds
+            float gflops            = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9;
+
+            std::cout << "  Performance: " << std::endl;
+            std::cout << "    Average kernel time: " << avg_time_us << " us" << std::endl;
+            std::cout << "    Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl;
+            std::cout << "    Compute throughput: " << gflops << " GFLOPS" << std::endl;
+
+            avg_times_us[static_i] = avg_time_us;
+        });
+
+        float speedup = avg_times_us[1] / avg_times_us[0];
+
+        std::cout << "On average kernel with prefetch is " << speedup
+                  << " times faster than without prefetch." << std::endl;
+
+        if(speedup < 1.0f)
+            std::cout << "WARNING: prefetch kernel is slower!" << std::endl;
+    }
+    else
+    {
+        launch_and_time_kernel(StreamConfig{nullptr, false},
+                               prefetch_kernel,
+                               dim3(grid_size),
+                               dim3(block_size),
+                               0, // lds_byte
+                               args);
+    }
+
+    // Copy results back
+    d_dst_with_prefetch_chunks.FromDevice(h_dst_with_prefetch_chunks.data());
+
+    // Verify results
+    bool pass = ck::utils::check_err(h_dst_with_prefetch_chunks, h_expected);
+
+    std::cout << "  Correctness: " << (pass ? "PASS" : "FAIL") << std::endl;
+    std::cout << std::endl;
+
+    return pass;
+}
+
+} // namespace prefetch_op_util
+} // namespace ck
diff --git a/test/s_prefetch_inst_op/s_prefetch_inst_op.cpp b/test/s_prefetch_inst_op/s_prefetch_inst_op.cpp
index 484ead5c3e..f072d5f8f7 100644
--- a/test/s_prefetch_inst_op/s_prefetch_inst_op.cpp
+++ b/test/s_prefetch_inst_op/s_prefetch_inst_op.cpp
@@ -1,39 +1,39 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "ck/ck.hpp"
-#include "ck/host_utility/device_prop.hpp"
-
-#include "s_prefetch_inst_op_util.hpp"
-
-template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS>
-bool run_test(bool time_kernels)
-{
-    return ck::s_prefetch_inst_op_util::test_inst_prefetch_impl<T, NUM_THREADS, NUM_SCALARS>(
-        time_kernels, "s_prefetch_inst_pc_rel");
-}
-
-int main(int argc, char* argv[])
-{
-    if(!ck::is_gfx12_supported())
-    {
-        std::cout << "instruction cache prefetch is not supported by current HW, skipping tests."
-                  << std::endl;
-        return 0;
-    }
-
-    bool time_kernels = false;
-    if(argc == 2)
-    {
-        time_kernels = std::stoi(argv[1]);
-    }
-
-    bool pass = true;
-
-    std::cout << "=== Testing Instruction Prefetch ===" << std::endl;
-
-    pass &= run_test<float, 4096, 16384>(time_kernels);
-
-    std::cout << "TestInstPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
-    return pass ? 0 : 1;
-}
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck/ck.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+#include "s_prefetch_inst_op_util.hpp"
+
+template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS>
+bool run_test(bool time_kernels)
+{
+    return ck::s_prefetch_inst_op_util::test_inst_prefetch_impl<T, NUM_THREADS, NUM_SCALARS>(
+        time_kernels, "s_prefetch_inst_pc_rel");
+}
+
+int main(int argc, char* argv[])
+{
+    if(!ck::is_gfx12_supported())
+    {
+        std::cout << "instruction cache prefetch is not supported by current HW, skipping tests."
+                  << std::endl;
+        return 0;
+    }
+
+    bool time_kernels = false;
+    if(argc == 2)
+    {
+        time_kernels = std::stoi(argv[1]);
+    }
+
+    bool pass = true;
+
+    std::cout << "=== Testing Instruction Prefetch ===" << std::endl;
+
+    pass &= run_test<float, 4096, 16384>(time_kernels);
+
+    std::cout << "TestInstPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
+    return pass ? 0 : 1;
+}