From 7e44b845b5dd4bcc28d55b4b2764e2be6418a35a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Pietil=C3=A4?=
 <188998872+vpietila-amd@users.noreply.github.com>
Date: Fri, 17 Oct 2025 15:36:39 +0300
Subject: [PATCH 1/7] Fixed handling of split-K autodeduce argument for grouped
 convolution (#3024)

* Fix handling of split-K autodeduce argument.

* Fix clang formatting.

* Test fix.

* Fix clang formatting.
---
 ...rd_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp |  6 +++++
 ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp |  8 ++++++
 ...ce_grouped_gemm_multi_abd_xdl_fixed_nk.hpp |  6 +++++
 ..._grouped_convnd_bwd_data_interface_xdl.cpp | 27 +++++++++++++++++--
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index ff652ebefb..febb037157 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -689,6 +689,12 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
             return false;
         }
 
+        // Split-K autodeduction is not supported
+        if(arg.k_batch_ < 1)
+        {
+            return false;
+        }
+
         // Gridwise GEMM size
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
                                            arg.b_grid_desc_kbatch_k0_n_k1_,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index 47832e2153..4672de3504 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -1523,6 +1523,14 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                 return false;
             }
         }
+        else
+        {
+            // Split-K autodeduction is not supported.
+            if(arg.k_batch_ < 1)
+            {
+                return false;
+            }
+        }
 
         const index_t ConvG               = arg.b_g_k_c_xs_lengths_[0];
         const index_t ConvK               = arg.b_g_k_c_xs_lengths_[1];
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp
index f6ec0908eb..d5d48777a0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp
@@ -688,6 +688,12 @@ struct DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
 
     static bool IsSupportedArgument(const Argument& arg)
     {
+        // Split-K autodeduction is not supported
+        if(arg.k_batch_ < 1)
+        {
+            return false;
+        }
+
         if(ck::type_convert<ck::index_t>(arg.gemm_desc_kernel_arg_.size()) != arg.group_count_)
         {
             return false;
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp
index 01f4260c43..7903c17b22 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp
@@ -47,10 +47,11 @@ class TestGroupedConvndBwdData : public ::testing::Test
     // ######|            |          |          |           |         |        Type|        Type|        Type|         DataType|         Type|       Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
     // ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
     // ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
-    < NDimSpatial, OutLayout, WeiLayout, ck::Tuple<>, InLayout, DataType, DataType, AccDataType, DataType,   ck::Tuple<>, DataType, Pass, Pass,    Pass,  ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   16,   16,       4,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4>;
+              < NDimSpatial, OutLayout, WeiLayout, ck::Tuple<>, InLayout, DataType, DataType, AccDataType, DataType,   ck::Tuple<>, DataType, Pass, Pass,    Pass,  ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   16,   16,       4,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4>;
     // clang-format on
 
     ck::utils::conv::ConvParam conv_param;
+    ck::index_t split_k{1};
 
     template <ck::index_t NDimSpatial>
     bool Run()
@@ -112,7 +113,8 @@ class TestGroupedConvndBwdData : public ::testing::Test
                                           input_right_pads,
                                           Pass{},
                                           Pass{},
-                                          Pass{});
+                                          Pass{},
+                                          split_k);
         return conv.IsSupportedArgument(argument);
     }
 };
@@ -176,3 +178,24 @@ TYPED_TEST(TestGroupedConvndBwdDataDefault, VectorLoadCheck)
     is_supported     = this->template Run<2>();
     EXPECT_FALSE(is_supported);
 }
+
+TYPED_TEST(TestGroupedConvndBwdDataDefault, SplitK)
+{
+    if(ck::is_xdl_supported())
+    {
+        // SplitK = 1
+        this->conv_param  = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+        this->split_k     = 1;
+        bool is_supported = this->template Run<2>();
+        EXPECT_TRUE(is_supported);
+
+        // Split-K autodeduce
+        this->split_k = -1;
+        is_supported  = this->template Run<2>();
+        EXPECT_FALSE(is_supported);
+    }
+    else
+    {
+        GTEST_SKIP() << "XDL ops not supported on this device";
+    }
+}

From 8a4cd32d8692c54a3a500ec65d2623c9d27bd7f5 Mon Sep 17 00:00:00 2001
From: Johannes Graner <johannes.graner@amd.com>
Date: Fri, 17 Oct 2025 18:28:38 +0200
Subject: [PATCH 2/7] Pre-commit in CI (#3029)

* Pre-commit in CI

* Specify python version, and install dos2unix for remod

* Refactor remod hook to correctly install dependencies

* Run pre-commit
---
 .github/workflows/pre-commit.yml | 16 ++++++++++++++++
 .pre-commit-config.yaml          | 11 +++++++----
 example/ck_tile/remod.py         |  9 +++++++--
 include/ck_tile/ops/gemm.hpp     |  3 ++-
 include/ck_tile/remod.py         |  8 ++++++--
 script/install_precommit.sh      |  3 ---
 script/remod_for_ck_tile.py      | 13 +++++++++++++
 script/remod_for_ck_tile.sh      |  7 -------
 8 files changed, 51 insertions(+), 19 deletions(-)
 create mode 100644 .github/workflows/pre-commit.yml
 create mode 100755 script/remod_for_ck_tile.py
 delete mode 100755 script/remod_for_ck_tile.sh

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 0000000000..16f7e2539c
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,16 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [develop]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions/setup-python@v3
+      with:
+        python-version: '3.12'
+    - uses: pre-commit/action@v3.0.1
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 03d33757b0..04ebc6b45a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -32,9 +32,12 @@ repos:
         language: script
         types_or: [c++, text]
         verbose: true
-    -   id: run-remod-if-ck-tile-changed
-        name: Run remod.py if ck_tile files changed
-        entry: script/remod_for_ck_tile.sh
-        language: script
+    -   id: remod-ck-tile
+        name: Run ck_tile remod.py
+        entry: python script/remod_for_ck_tile.py
+        language: python
         files: '^(include|example)/ck_tile/.*$'
+        additional_dependencies:
+        - dos2unix
+        - clang-format==18.1.3
         pass_filenames: false
diff --git a/example/ck_tile/remod.py b/example/ck_tile/remod.py
index b2ac7c52bf..4fa3a4e430 100644
--- a/example/ck_tile/remod.py
+++ b/example/ck_tile/remod.py
@@ -1,3 +1,4 @@
+import os
 import pathlib
 from pathlib import Path
 import subprocess
@@ -10,8 +11,12 @@ for p in sorted(Path("./").rglob("*")):
 
 # formatting
 for x in all_files:
-    subprocess.Popen(f"dos2unix -n {str(x)}", shell=True)
-    cmd = f"clang-format-18 -style=file -i {str(x)}"
+    subprocess.Popen(
+        f"python -m dos2unix {str(x)} {str(x)}",
+        shell=True,
+        stdout=open(os.devnull, "wb"),
+    )
+    cmd = f"clang-format -style=file -i {str(x)}"
     # for xp in x.parents:
     # print(get_file_base(x))
     subprocess.Popen(cmd, shell=True)
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index 6b587f81d5..e1026485d7 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -33,9 +33,10 @@
 #include "ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
-#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp"
 #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp"
+#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp"
+#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp"
 #include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp"
diff --git a/include/ck_tile/remod.py b/include/ck_tile/remod.py
index bd940036bd..a8ff2defe5 100644
--- a/include/ck_tile/remod.py
+++ b/include/ck_tile/remod.py
@@ -86,8 +86,12 @@ class submodule_t:
 submodule = submodule_t()
 # formatting
 for x in all_files:
-    subprocess.Popen(f"dos2unix -n {str(x)}", shell=True)
-    cmd = f"clang-format-18 -style=file -i {str(x)}"
+    subprocess.Popen(
+        f"python -m dos2unix {str(x)} {str(x)}",
+        shell=True,
+        stdout=open(os.devnull, "wb"),
+    )
+    cmd = f"clang-format -style=file -i {str(x)}"
     # for xp in x.parents:
     # print(get_file_base(x))
     subprocess.Popen(cmd, shell=True)
diff --git a/script/install_precommit.sh b/script/install_precommit.sh
index fd1840290e..545dcfa666 100755
--- a/script/install_precommit.sh
+++ b/script/install_precommit.sh
@@ -13,9 +13,6 @@ echo "I: Creating and activating virtual environment for pre-commit..."
 python3 -m venv "$(dirname "$0")/../.venv"
 source "$(dirname "$0")/../.venv/bin/activate"
 
-echo "I: Installing tools required for pre-commit checks..."
-run_and_check pip install dos2unix
-run_and_check pip install clang-format==18.1.3
 echo "I: Installing pre-commit in virtual environment..."
 run_and_check pip install pre-commit
 run_and_check pre-commit install
diff --git a/script/remod_for_ck_tile.py b/script/remod_for_ck_tile.py
new file mode 100755
index 0000000000..7601c9d619
--- /dev/null
+++ b/script/remod_for_ck_tile.py
@@ -0,0 +1,13 @@
+import os
+
+root_dir = os.getcwd()
+ck_tile_include = root_dir + "/include/ck_tile"
+ck_tile_example = root_dir + "/example/ck_tile"
+
+# Run for include
+os.chdir(ck_tile_include)
+_ = os.system("python remod.py")
+
+# Run for example
+os.chdir(ck_tile_example)
+_ = os.system("python remod.py")
diff --git a/script/remod_for_ck_tile.sh b/script/remod_for_ck_tile.sh
deleted file mode 100755
index 7b99ec60bd..0000000000
--- a/script/remod_for_ck_tile.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-# Copyright © Advanced Micro Devices, Inc., or its affiliates.
-# SPDX-License-Identifier: MIT
-
-# Run remod.py in both required locations
-(cd include/ck_tile/ && python3 remod.py)
-(cd example/ck_tile/ && python3 remod.py)

From 352dee5225cede21e82bb96f530425e54139f251 Mon Sep 17 00:00:00 2001
From: Emily Martins <65371150+ecamartins@users.noreply.github.com>
Date: Fri, 17 Oct 2025 10:33:38 -0600
Subject: [PATCH 3/7] Fix CK Tile Stream-K BF16 Validation Errors (#3039)

Prior to this change, the number of accumulations passed into
calculate_rtol_atol was 1. That said, in most cases, this is not correct
when there are multiple workgroups contributing to the same macro tile
in C.

This change ensures uses the function estimate_num_wgs_per_tile, which
was extracted into a common file and generalized, to estimate the number
of workgroups per macro tile. This estimate is passed into
calculate_rtol_atol to ensure we get a better relative and absolute
tolerance.
---
 .../40_streamk_gemm/run_gemm_example.inc      | 57 ++++++-------------
 .../40_streamk_gemm/streamk_gemm_basic.cpp    | 14 ++++-
 include/ck_tile/ops/common/streamk_common.hpp | 29 ++++++++++
 .../gemm_streamk/test_gemm_streamk.hpp        | 31 +++++++---
 4 files changed, 80 insertions(+), 51 deletions(-)

diff --git a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
index 5fdf6b29ef..6dd054ee11 100644
--- a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
+++ b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
@@ -2,29 +2,6 @@
 // SPDX-License-Identifier:  MIT
 #pragma once
 
-// Estimate the number of WGs contributing to the same macro tile in C
-template <ck_tile::StreamKReductionStrategy ReductionStrategy, typename TilePartitioner>
-int estimate_num_wgs_per_tile(const TilePartitioner& tile_partitioner)
-{
-    // In the case of non-atomic reduction or DP only, there will always be 1 WG contributing to a
-    // macro time in C
-    int num_wgs_per_tile = 1;
-
-    // Otherwise, for atomics, multiple WGs may be contributing to the same macro tile in C
-    if(tile_partitioner.sk_num_blocks > 0 &&
-       ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)
-    {
-        // Determine the number of iterations per WG for a given macro tile in C
-        uint32_t k_iters_per_block = tile_partitioner.k_iters_per_big_block - 1;
-
-        // Estimate the number of WGs per macro tile
-        num_wgs_per_tile = (tile_partitioner.k_iters_per_tile.get() / (k_iters_per_block)) +
-                           ((tile_partitioner.k_iters_per_tile.get() % k_iters_per_block) != 0);
-    }
-
-    return std::max(num_wgs_per_tile, 1);
-}
-
 template <typename Layout>
 static constexpr inline auto is_row_major(Layout)
 {
@@ -65,7 +42,8 @@ template <typename GemmConfig,
           typename CLayout,
           typename CDEElementWise = ck_tile::element_wise::PassThrough,
           ck_tile::StreamKReductionStrategy ReductionStrategy>
-std::tuple<float, int> gemm(const ck_tile::StreamKHostArgs& args, const ck_tile::stream_config& s);
+std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
+                                         const ck_tile::stream_config& s);
 
 template <typename GemmConfig,
           typename ADataType,
@@ -78,20 +56,21 @@ template <typename GemmConfig,
           typename DsLayout,
           typename CLayout,
           typename CDEElementWise = ck_tile::element_wise::PassThrough>
-std::tuple<float, int> invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
-                                   ck_tile::DeviceMem& b_k_n_dev_buf,
-                                   ck_tile::DeviceMem& c_m_n_dev_buf,
-                                   ck_tile::index_t M,
-                                   ck_tile::index_t N,
-                                   ck_tile::index_t K,
-                                   ck_tile::index_t stride_A,
-                                   ck_tile::index_t stride_B,
-                                   ck_tile::index_t stride_C,
-                                   int n_warmup,
-                                   int n_repeat,
-                                   bool flush_cache,
-                                   ck_tile::StreamKReductionStrategy reduction_strategy,
-                                   uint32_t num_sk_blocks)
+std::tuple<float, ck_tile::index_t>
+invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
+            ck_tile::DeviceMem& b_k_n_dev_buf,
+            ck_tile::DeviceMem& c_m_n_dev_buf,
+            ck_tile::index_t M,
+            ck_tile::index_t N,
+            ck_tile::index_t K,
+            ck_tile::index_t stride_A,
+            ck_tile::index_t stride_B,
+            ck_tile::index_t stride_C,
+            int n_warmup,
+            int n_repeat,
+            bool flush_cache,
+            ck_tile::StreamKReductionStrategy reduction_strategy,
+            uint32_t num_sk_blocks)
 {
     ck_tile::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(),
                                   b_k_n_dev_buf.GetDeviceBuffer(),
@@ -105,7 +84,7 @@ std::tuple<float, int> invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                                   reduction_strategy,
                                   num_sk_blocks};
 
-    std::tuple<float, int> ave_time_and_batch;
+    std::tuple<float, ck_tile::index_t> ave_time_and_batch;
 
     if(args.reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic)
     {
diff --git a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
index bb6b1eb413..40709e38e2 100644
--- a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
+++ b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
@@ -3,6 +3,7 @@
 
 #include "gemm_utils.hpp"
 #include "run_gemm_example.inc"
+#include "ck_tile/ops/common.hpp"
 
 template <typename GemmConfig,
           typename ADataType,
@@ -16,7 +17,8 @@ template <typename GemmConfig,
           typename ELayout,
           typename CDEElementWise,
           ck_tile::StreamKReductionStrategy ReductionStrategy>
-std::tuple<float, int> gemm(const ck_tile::StreamKHostArgs& args, const ck_tile::stream_config& s)
+std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
+                                         const ck_tile::stream_config& s)
 
 {
     using GemmShape = ck_tile::TileGemmShape<
@@ -42,7 +44,7 @@ std::tuple<float, int> gemm(const ck_tile::StreamKHostArgs& args, const ck_tile:
                                                                  GemmConfig::NumWaveGroups,
                                                                  GemmConfig::Preshuffle>;
 
-    const auto Run = [&](const auto memory_operation) -> std::tuple<float, int> {
+    const auto Run = [&](const auto memory_operation) -> std::tuple<float, ck_tile::index_t> {
         // We create the GEMM pipeline without specifying has_hot_loop or tail_num.
         // This is because num_loop can vary (a) per WG and (b) per iteration of the Stream-K
         // while loop. Instead, has_hot_loop and tail_num are determined in the Stream-K
@@ -113,7 +115,13 @@ std::tuple<float, int> gemm(const ck_tile::StreamKHostArgs& args, const ck_tile:
             preprocess,
             ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
-        int num_wgs_per_tile = estimate_num_wgs_per_tile<ReductionStrategy>(kargs.tile_partitioner);
+        ck_tile::index_t num_wgs_per_tile = ck_tile::estimate_num_wgs_per_tile<ReductionStrategy>(
+            kargs.tile_partitioner.sk_num_blocks,
+            // k_iters_per_big_block could be 1, which indicates that all Stream-K workgroups are
+            // big and each does one iteration. Thus, we ensure the value passed in is at least 1 to
+            // avoid division by zero errors.
+            ck_tile::max(kargs.tile_partitioner.k_iters_per_big_block - 1, 1u),
+            kargs.tile_partitioner.k_iters_per_tile.get());
 
         return std::tuple{ave_time, num_wgs_per_tile};
     };
diff --git a/include/ck_tile/ops/common/streamk_common.hpp b/include/ck_tile/ops/common/streamk_common.hpp
index 5dbe6223c4..c01e967dcd 100644
--- a/include/ck_tile/ops/common/streamk_common.hpp
+++ b/include/ck_tile/ops/common/streamk_common.hpp
@@ -11,4 +11,33 @@ enum StreamKReductionStrategy : uint32_t
     Atomic    = 0u,
     Reduction = 1u
 };
+
+/**
+ * @brief Estimates the number of Stream-K workgroups per macro tile in the C tensor.
+ *
+ * @param sk_ctas           Number of Stream-K workgroups.
+ * @param iters_per_sk_cta  Number of iterations per Stream-K workgroup.
+ * @param iters_per_tile    Number of iterations per tile (i.e., the number of macro tiles in the K
+ * dimension).
+ * @return ck_tile::index_t An estimate of the number of workgroups per macro tile in the C tensor.
+ * @note It is assumed that `iters_per_sk_cta` > 0.
+ */
+template <ck_tile::StreamKReductionStrategy ReductionStrategy>
+ck_tile::index_t
+estimate_num_wgs_per_tile(index_t sk_ctas, index_t iters_per_sk_cta, index_t iters_per_tile)
+{
+    // In the case of non-atomic reduction or data-parallel only, there will always be 1 workgroup
+    // writing final results to a given macro tile in C.
+    int num_wgs_per_tile = 1;
+
+    // Otherwise, for atomics, multiple workgroups may be writing to the same macro tile in C.
+    if(sk_ctas > 0 && ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)
+    {
+        // Estimate the number of workgroups per macro tile.
+        num_wgs_per_tile =
+            (iters_per_tile / iters_per_sk_cta) + ((iters_per_tile % iters_per_sk_cta) != 0);
+    }
+
+    return std::max(num_wgs_per_tile, 1);
+}
 } // namespace ck_tile
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp
index da0b8d153d..c341789435 100644
--- a/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp
@@ -10,6 +10,7 @@
 #include <tuple>
 
 #include "ck_tile/host.hpp"
+#include "ck_tile/ops/common.hpp"
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 
@@ -50,10 +51,10 @@ class TestCkTileStreamK : public ::testing::Test
               bool PadK       = true,
               bool Preshuffle = false,
               bool TransposeC = false>
-    bool invoke_streamk(const ck_tile::StreamKHostArgs& args,
-                        const ck_tile::stream_config& s,
-                        int num_cu,
-                        int occupancy)
+    std::tuple<bool, ck_tile::index_t> invoke_streamk(const ck_tile::StreamKHostArgs& args,
+                                                      const ck_tile::stream_config& s,
+                                                      int num_cu,
+                                                      int occupancy)
     {
         constexpr bool kPadM      = PadM;
         constexpr bool kPadN      = PadN;
@@ -129,7 +130,7 @@ class TestCkTileStreamK : public ::testing::Test
 
             if(!Kernel::IsSupportedArgument(kargs))
             {
-                return false;
+                return std::tuple{false, -1};
             }
 
             dim3 grid_dims  = Kernel::GridSize(kargs.tile_partitioner);
@@ -138,7 +139,16 @@ class TestCkTileStreamK : public ::testing::Test
             ck_tile::launch_kernel(
                 s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grid_dims, block_dims, 0, kargs));
 
-            return true;
+            ck_tile::index_t num_accumulations_per_tile =
+                ck_tile::estimate_num_wgs_per_tile<ReductionStrategy>(
+                    kargs.tile_partitioner.sk_num_blocks,
+                    // k_iters_per_big_block could be 1, which indicates that all blocks are
+                    // big and each does one iteration. Thus, we ensure the value passed in is at
+                    // least 1 to avoid division by zero errors.
+                    ck_tile::max(kargs.tile_partitioner.k_iters_per_big_block - 1, 1u),
+                    kargs.tile_partitioner.k_iters_per_tile.get());
+
+            return std::tuple{true, num_accumulations_per_tile};
         };
 
         return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
@@ -238,8 +248,11 @@ class TestCkTileStreamK : public ::testing::Test
                                       reduction_strategy,
                                       num_sk_blocks};
 
-        if(!invoke_streamk<ck_tile::StreamKReductionStrategy::Atomic>(
-               args, ck_tile::stream_config{nullptr, false, 0, 0, 1}, num_cu, occupancy))
+        const auto [is_valid_instance, num_accumulations_per_tile] =
+            invoke_streamk<ck_tile::StreamKReductionStrategy::Atomic>(
+                args, ck_tile::stream_config{nullptr, false, 0, 0, 1}, num_cu, occupancy);
+
+        if(!is_valid_instance)
         {
             GTEST_SKIP() << "Skipping this test: The kernel cannot solve the problem\n";
         }
@@ -256,7 +269,7 @@ class TestCkTileStreamK : public ::testing::Test
         const float max_accumulated_value =
             *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
         const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, /*kbatch*/ 1, max_accumulated_value);
+            K, num_accumulations_per_tile, max_accumulated_value);
 
         bool pass = ck_tile::check_err(c_m_n_dev_result,
                                        c_m_n_host_ref,

From 889ffc0b1d9a6913ee84f44c08d690a1e4d4828d Mon Sep 17 00:00:00 2001
From: Yashvardhan Agarwal <yashagar@amd.com>
Date: Fri, 17 Oct 2025 19:49:21 +0300
Subject: [PATCH 4/7] fix identity values in Max and AbsMax (#3048)

- The identity value method returned the minimum positive number while
we need the lowest number for Max and AbsMax operations
---
 include/ck_tile/core/utility/reduce_operator.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ck_tile/core/utility/reduce_operator.hpp b/include/ck_tile/core/utility/reduce_operator.hpp
index a698c91e45..f870bd99d6 100644
--- a/include/ck_tile/core/utility/reduce_operator.hpp
+++ b/include/ck_tile/core/utility/reduce_operator.hpp
@@ -73,7 +73,7 @@ struct Max
                                           std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
     CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
     {
-        return numeric<T>::min();
+        return numeric<T>::lowest();
     };
 
     template <typename T,
@@ -96,7 +96,7 @@ struct AbsMax
                                           std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
     CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
     {
-        return numeric<T>::min();
+        return numeric<T>::lowest();
     };
 
     template <typename T,

From b03764ca5a917752845ddbb5da8886051a16d9be Mon Sep 17 00:00:00 2001
From: AviralGoelAMD <aviral.goel@amd.com>
Date: Wed, 15 Oct 2025 02:39:04 +0000
Subject: [PATCH 5/7] docs: add inline comments about flush_cache and rotating
 buffer

---
 include/ck_tile/host/flush_icache.hpp     |  6 ++++
 include/ck_tile/host/rotating_buffers.hpp | 41 ++++++++++++++++++-----
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/include/ck_tile/host/flush_icache.hpp b/include/ck_tile/host/flush_icache.hpp
index 9230b50a13..f4852252be 100644
--- a/include/ck_tile/host/flush_icache.hpp
+++ b/include/ck_tile/host/flush_icache.hpp
@@ -6,6 +6,12 @@
 #include <hip/hip_runtime.h>
 
 namespace ck_tile {
+// GPU kernel to invalidate instruction cache for accurate benchmarking.
+// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit,
+//               forcing subsequent kernel runs to fetch instructions from HBM instead of cache.
+// 16x s_nop:    Wait cycles (~16 cycles) to ensure cache invalidation completes before kernel
+//               exits. Without these NOPs, the flush may not finish, leading to inconsistent
+//               timing measurements where some instructions remain cached.
 static __global__ void flush_cache()
 {
     asm __volatile__("s_icache_inv \n\t"
diff --git a/include/ck_tile/host/rotating_buffers.hpp b/include/ck_tile/host/rotating_buffers.hpp
index 86f68ad084..154d67fb8e 100644
--- a/include/ck_tile/host/rotating_buffers.hpp
+++ b/include/ck_tile/host/rotating_buffers.hpp
@@ -9,6 +9,20 @@
 
 namespace ck_tile {
 
+// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
+//
+// Purpose:
+//   When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
+//   will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
+//   timing measurements. This wrapper rotates through multiple copies of buffers at different
+//   memory addresses to force cache misses.
+//
+// How it works:
+//   Constructor: Creates rotating_count copies of matrices A and B in GPU memory
+//   Next():      Switches pointers to the next buffer copy (cycles through all copies)
+//   Destructor:  Frees extra buffer copies and restores original pointers
+//
+// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
 template <typename ADataType, typename BDataType>
 struct RotatingMemWrapper
 {
@@ -24,15 +38,18 @@ struct RotatingMemWrapper
           size_a(size_a_),
           size_b(size_b_)
     {
+        // Store original buffer pointers as first entry
         p_a_grids.push_back(a_ptr);
         p_b_grids.push_back(b_ptr);
+
+        // Create (rotating_count - 1) additional copies at different memory addresses
         for(size_t i = 1; i < rotating_count; i++)
         {
             {
                 void* pADeviceBuf;
                 HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
-                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf),
-                                          const_cast<void*>(p_a_grids[0]),
+                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), // target buffer
+                                          const_cast<void*>(p_a_grids[0]), // source buffer
                                           size_a_,
                                           hipMemcpyDeviceToDevice));
                 p_a_grids.push_back(pADeviceBuf);
@@ -41,19 +58,21 @@ struct RotatingMemWrapper
             {
                 void* pBDeviceBuf;
                 HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
-                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf),
-                                          const_cast<void*>(p_b_grids[0]),
+                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), // target buffer
+                                          const_cast<void*>(p_b_grids[0]), // source buffer
                                           size_b_,
                                           hipMemcpyDeviceToDevice));
                 p_b_grids.push_back(pBDeviceBuf);
             }
         }
     }
+    // Rotate to the next buffer copy. Call this before each kernel run to use different
+    // memory addresses, forcing the GPU to fetch data from HBM instead of cache.
     void Next()
     {
         if(rotating_count > 1)
         {
-            std::size_t idx = iter++ % rotating_count;
+            std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
             a_ptr           = p_a_grids[idx];
             b_ptr           = p_b_grids[idx];
         }
@@ -63,15 +82,16 @@ struct RotatingMemWrapper
         std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
                   << ", rotating_count: " << rotating_count << "}" << std::endl;
     }
+    // Cleanup: Free all extra buffer copies (keeping original) and restore original pointers
     ~RotatingMemWrapper() noexcept
     {
         if(rotating_count > 1)
         {
-            // restore ptr
+            // Restore original buffer pointers
             a_ptr = p_a_grids[0];
             b_ptr = p_b_grids[0];
 
-            // free device mem
+            // Free extra buffer copies (index 0 is the original, don't free it)
             for(size_t i = 1; i < rotating_count; i++)
             {
                 ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
@@ -94,7 +114,12 @@ inline void flush_icache()
 {
     hipDeviceProp_t deviceProps;
     HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
-    int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;
+
+    // Over-provision blocks to ensure all CUs execute the flush instruction.
+    // With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
+    // 60x over-provisioning provides statistical certainty that every CU gets at least one block.
+    constexpr int32_t blocks_per_cu = 60;
+    int32_t gpu_block3              = deviceProps.multiProcessorCount * blocks_per_cu;
 
     ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
     HIP_CHECK_ERROR(hipGetLastError());

From d88ea05c844cd159a14213b73a5818a43c5b79e6 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 17 Oct 2025 19:52:22 -0700
Subject: [PATCH 6/7] disable aiter test gemm_a8w8_blockscale (#3049)

---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 3fbcdb5849..43b51d4f0f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -909,7 +909,7 @@ def run_aiter_tests(Map conf=[:]){
                 sh "rocminfo"
                 sh "python3 --version"
                 sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py"
-                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py"
+                //sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py" //temporarily disable
                 sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py"
                 sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py"
                 sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py"

From af3786fe0814a75646ff3194f86eab0e24b047e6 Mon Sep 17 00:00:00 2001
From: BrianHarrisonAMD <169072757+BrianHarrisonAMD@users.noreply.github.com>
Date: Sun, 19 Oct 2025 17:09:21 -0600
Subject: [PATCH 7/7] Add dvc pull step (#3056)

* Add dvc pull step

* Remove CD

* Add details about LOGNAME and fail if dvc isn't installed
---
 .github/workflows/therock-ci-linux.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml
index beaabbe763..f4d0c0063c 100644
--- a/.github/workflows/therock-ci-linux.yml
+++ b/.github/workflows/therock-ci-linux.yml
@@ -35,6 +35,15 @@ jobs:
         with:
           repository: "ROCm/rocm-libraries"
 
+      - name: Pull DVC files for rocm-libraries # LOGNAME details here https://github.com/ROCm/rocm-libraries/pull/1617
+        run: |
+          if command -v dvc &> /dev/null; then
+            echo "dvc detected"
+          else
+            echo "Warning, dvc not detected!"
+          fi
+          LOGNAME=github-runner dvc pull -v
+
       - name: Checkout composable_kernel repository
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with: