From 98696413248802ab8007b709e5fc76988b5600b6 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 12 Dec 2025 09:27:12 -0800
Subject: [PATCH 01/10] disable test_tile_gemm_quant_bquant_preshuffle (#3420)

---
 test/ck_tile/gemm_block_scale/CMakeLists.txt | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/test/ck_tile/gemm_block_scale/CMakeLists.txt b/test/ck_tile/gemm_block_scale/CMakeLists.txt
index 8309b14f0a..2b0ffaafa2 100755
--- a/test/ck_tile/gemm_block_scale/CMakeLists.txt
+++ b/test/ck_tile/gemm_block_scale/CMakeLists.txt
@@ -24,10 +24,12 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     target_compile_options(test_tile_gemm_quant_bquant PRIVATE ${TEST_GEMM_COMPILE_OPTIONS})
 
     # BQuant tests (with PreshuffleB)
-    add_gtest_executable(test_tile_gemm_quant_bquant_preshuffle 
-        test_gemm_quant_bquant_preshuffle.cpp
-    )
-    target_compile_options(test_tile_gemm_quant_bquant_preshuffle PRIVATE ${TEST_GEMM_COMPILE_OPTIONS})
+    # Disabled until this test can be built within a reasonable time;
+    # it currently takes ~50 minutes to build on gfx12.
+    #add_gtest_executable(test_tile_gemm_quant_bquant_preshuffle
+    #    test_gemm_quant_bquant_preshuffle.cpp
+    #)
+    #target_compile_options(test_tile_gemm_quant_bquant_preshuffle PRIVATE ${TEST_GEMM_COMPILE_OPTIONS})
 
     # RowColQuant tests
     add_gtest_executable(test_tile_gemm_quant_rowcol 

From fc7bf0ab1c5ed28e5962681007f84a2e8d3ee051 Mon Sep 17 00:00:00 2001
From: linqunAMD <qlin@amd.com>
Date: Sat, 13 Dec 2025 01:28:37 +0800
Subject: [PATCH 02/10] [CK_TILE] Port hw independent changes from internal
 repo to develop branch (#3301)

* [CK_TILE] Port hw independent changes from internal repo to develop branch

This port includes PRs #96, #114, #120, and #121 from the internal repo.

* correct rebase error
---
 example/ck_tile/03_gemm/gemm_utils.hpp        |   2 +-
 example/ck_tile/03_gemm/run_gemm_example.inc  |   4 +-
 .../fused_moe/kernel/moe_sorting_kernel.hpp   |   2 +
 .../ops/gemm/kernel/grouped_gemm_kernel.hpp   |   1 +
 .../ops/gemm/kernel/streamk_gemm_kernel.hpp   |   7 +-
 .../ops/gemm/kernel/universal_gemm_kernel.hpp |   2 +-
 .../gemm_pipeline_agmem_bgmem_creg_v1.hpp     |  32 +-
 .../gemm_pipeline_agmem_bgmem_creg_v2.hpp     |  25 +-
 .../ops/gemm/pipeline/tile_gemm_traits.hpp    |   5 +-
 .../ops/reduce/block/block_reduce2d.hpp       |   2 +-
 include/ck_tile/utility/json_dump.hpp         | 475 +++++++++---------
 .../epilogue/test_cshuffle_epilogue_util.hpp  |   2 +-
 test/ck_tile/gemm_multi_abd/CMakeLists.txt    |   2 +-
 .../test_gemm_multi_abd_cshuffle.cpp          |  15 +-
 .../test_gemm_multi_abd_default2d.cpp         |   8 +-
 .../test_gemm_multi_abd_util.hpp              |  36 +-
 .../test_gemm_pipeline_util.hpp               |  24 +-
 .../grouped_gemm_multi_d/CMakeLists.txt       |   2 +-
 .../test_grouped_gemm_multi_d.cpp             |  53 +-
 .../grouped_gemm_preshuffle/CMakeLists.txt    |   2 +-
 .../test_grouped_gemm_preshuffle.cpp          |  12 +-
 .../test_grouped_gemm_preshuffle_util.hpp     |  62 ++-
 22 files changed, 465 insertions(+), 310 deletions(-)

diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp
index b25aec101b..47c47334e7 100644
--- a/example/ck_tile/03_gemm/gemm_utils.hpp
+++ b/example/ck_tile/03_gemm/gemm_utils.hpp
@@ -459,7 +459,7 @@ struct PipelineTypeTraits<ck_tile::GemmPipeline::PRESHUFFLE_V2>
         ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2<PipelineProblem>;
 };
 
-auto create_args()
+inline auto create_args()
 {
     ck_tile::ArgParser arg_parser;
     arg_parser.insert("m", "3840", "m dimension")
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index c4f100b36b..78f3a9b0b3 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -197,8 +197,8 @@ bool do_verify(const ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
     return pass;
 }
 
-std::tuple<ck_tile::index_t, ck_tile::index_t, ck_tile::index_t>
-parse_gemm_size(ck_tile::ArgParser& arg_parser)
+std::tuple<ck_tile::index_t, ck_tile::index_t, ck_tile::index_t> inline parse_gemm_size(
+    ck_tile::ArgParser& arg_parser)
 {
     ck_tile::index_t M = arg_parser.get_int("m");
     ck_tile::index_t N = arg_parser.get_int("n");
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
index 3445f063f5..52b2b86574 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -986,6 +986,8 @@ struct MoeSortingKernel
                 p_sorted_expert_ids[unit_size_mdiv.div(i)] = expert_id;
             }
         }
+        __syncthreads();
+
         smem_cumdup(num_experts) = smem_cumsum(num_experts);
 
         // fill the p_sorted_token_ids/p_sorted_weights
diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
index 63993c5eb6..838fc236d2 100644
--- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
@@ -561,6 +561,7 @@ struct GroupedGemmKernel
                 const auto block_idx_2d = OffsetTile1DPartitioner::GetOffsetedTileIndex(
                     0, kargs.M, kargs.N, (block_id - block_start) % grid_size_2d);
                 Run(kargs, block_idx_2d, (block_id - block_start) / grid_size_2d);
+                block_sync_lds();
                 block_id = block_id + grid_size; // advance to next block
                 // NOTE: this check is redundant but helps the compiler avoid spilling some VGPR
                 if(block_id >= cum_grid_size)
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
index 91f1358321..6130107cfe 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
@@ -631,6 +631,7 @@ struct StreamKKernel
             tile_idx += kargs.tile_partitioner.get_grid())
         {
             BaseGemm(kargs, tile_idx, dp_num_loop, 0, 0, kargs.K, smem_ptr_0);
+            block_sync_lds();
         }
 
         // Stream-K section
@@ -679,8 +680,8 @@ struct StreamKKernel
     {
         hipDeviceProp_t dev_prop;
         hipDevice_t dev;
-        hip_check_error(hipGetDevice(&dev));
-        hip_check_error(hipGetDeviceProperties(&dev_prop, dev));
+        ck_tile::hip_check_error(hipGetDevice(&dev));
+        ck_tile::hip_check_error(hipGetDeviceProperties(&dev_prop, dev));
         int num_cu = dev_prop.multiProcessorCount;
 
         return num_cu;
@@ -700,7 +701,7 @@ struct StreamKKernel
         constexpr int min_block_per_cu = 1;
         const auto kernel              = kentry<min_block_per_cu, Kernel, KernelArgs>;
 
-        hip_check_error(
+        ck_tile::hip_check_error(
             hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, kBlockSize, 0));
 
         return max(occupancy, 1);
diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
index 4b28ac3f12..866a4cc693 100644
--- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
@@ -280,7 +280,7 @@ struct UniversalGemmKernel
         using Kernel      = UniversalGemmKernel<TilePartitioner, GemmPipeline, EpiloguePipeline>;
         const auto kernel = kentry<1, Kernel, KernelArgs>;
         int occupancy;
-        hip_check_error(
+        ck_tile::hip_check_error(
             hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize().x, 0));
 
         const int grid_size = get_available_compute_units(s) * occupancy;
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
index 16ed8de22f..936c38ddf3 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -9,11 +9,35 @@
 
 namespace ck_tile {
 
+template <typename Problem>
+struct BaseGemmPipelineAGmemBGmemCRegV1 // constants and fixed queries for the V1 pipeline
+{
+    static constexpr index_t PrefetchStages   = 1;
+    static constexpr index_t PrefillStages    = 1;
+    static constexpr index_t GlobalBufferNum  = 1;
+    static constexpr bool UsePersistentKernel = false;
+    // Forwards the TransposeC setting declared by Problem.
+    CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
+    // Always returns true; the index_t argument is ignored.
+    CK_TILE_HOST_DEVICE static constexpr bool BlockHasHotloop(index_t) { return true; }
+    // Always returns TailNumber::Empty; the index_t argument is ignored.
+    CK_TILE_HOST_DEVICE static constexpr TailNumber GetBlockLoopTailNum(index_t)
+    {
+        return TailNumber::Empty;
+    }
+    // Runtime bool/TailNumber args are unused: run_func always gets <true, TailNumber::Empty>.
+    template <typename RunFunction>
+    CK_TILE_HOST_DEVICE static auto TailHandler(const RunFunction& run_func, bool, TailNumber)
+    {
+        return run_func(bool_constant<true>{}, integral_constant<TailNumber, TailNumber::Empty>{});
+    }
+};
+
 //  A Tile Window: global memory
 //  B Tile Window: global memory
 //  C Distributed tensor: register
 template <typename Problem, typename Policy = UniversalGemmPipelineAgBgCrPolicy>
-struct GemmPipelineAGmemBGmemCRegV1
+struct GemmPipelineAGmemBGmemCRegV1 : public BaseGemmPipelineAGmemBGmemCRegV1<Problem>
 {
     using AsDataType = remove_cvref_t<typename Problem::AsDataTypeTuple>;
     using BsDataType = remove_cvref_t<typename Problem::BsDataTypeTuple>;
@@ -48,14 +72,14 @@ struct GemmPipelineAGmemBGmemCRegV1
     template <bool IsWave32Host = false>
     static constexpr index_t GetVectorSizeA()
     {
-        return Problem::VectorSizeA;
+        return Policy::template GetVectorSizeA<Problem, IsWave32Host>();
     }
     template <bool IsWave32Host = false>
     static constexpr index_t GetVectorSizeB()
     {
-        return Problem::VectorSizeB;
+        return Policy::template GetVectorSizeB<Problem, IsWave32Host>();
     }
-    static constexpr index_t GetVectorSizeC() { return Problem::VectorSizeC; }
+    static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
 
     static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
     static constexpr index_t GetSmemPackB() { return Policy::template GetSmemPackB<Problem>(); }
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
index 5dbcde80a6..c711c768ec 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
@@ -9,11 +9,34 @@
 
 namespace ck_tile {
 
+template <typename Problem>
+struct BaseGemmPipelineAGmemBGmemCRegV2 // constants and fixed queries for the V2 pipeline
+{
+    static constexpr index_t PrefetchStages   = 2;
+    static constexpr index_t PrefillStages    = 1;
+    static constexpr index_t GlobalBufferNum  = 1;
+    static constexpr bool UsePersistentKernel = Problem::Traits::UsePersistentKernel;
+    // Forwards the TransposeC setting declared by Problem.
+    CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
+    // Always returns true; the index_t argument is ignored.
+    CK_TILE_HOST_DEVICE static constexpr bool BlockHasHotloop(index_t) { return true; }
+    // Always returns TailNumber::Empty; the index_t argument is ignored.
+    CK_TILE_HOST_DEVICE static constexpr TailNumber GetBlockLoopTailNum(index_t)
+    {
+        return TailNumber::Empty;
+    }
+    // Runtime bool/TailNumber args are unused: run_func always gets <true, TailNumber::Empty>.
+    template <typename RunFunction>
+    CK_TILE_HOST_DEVICE static auto TailHandler(const RunFunction& run_func, bool, TailNumber)
+    {
+        return run_func(bool_constant<true>{}, integral_constant<TailNumber, TailNumber::Empty>{});
+    }
+};
 //  A Tile Window: global memory
 //  B Tile Window: global memory
 //  C Distributed tensor: register
 template <typename Problem, typename Policy = GemmPipelineAGmemBGmemCRegV2DefaultPolicy>
-struct GemmPipelineAGmemBGmemCRegV2
+struct GemmPipelineAGmemBGmemCRegV2 : public BaseGemmPipelineAGmemBGmemCRegV2<Problem>
 {
     using AsDataType = remove_cvref_t<typename Problem::AsDataTypeTuple>;
     using BsDataType = remove_cvref_t<typename Problem::BsDataTypeTuple>;
diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
index d76fd6dc0f..47607a40f5 100644
--- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
@@ -43,13 +43,14 @@ template <bool kPadM_,
           bool UseStructuredSparsity_ = false,
           bool UsePersistentKernel_   = false,
           index_t NumWaveGroups_      = 1,
-          bool Preshuffle_            = false>
+          bool Preshuffle_            = false,
+          int VectorSize_             = 16>
 struct TileGemmUniversalTraits
 {
     static constexpr bool kPadM            = kPadM_;
     static constexpr bool kPadN            = kPadN_;
     static constexpr bool kPadK            = kPadK_;
-    static constexpr int _VectorSize       = 16;
+    static constexpr int _VectorSize       = VectorSize_;
     static constexpr bool DoubleSmemBuffer = DoubleSmemBuffer_;
 
     using AsLayout = AsLayout_;
diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
index cbf4afefb2..ba6ed27651 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
@@ -425,7 +425,7 @@ struct BlockReduce2dCrossWarpSync
 
         if constexpr(num_reduce_warps == 1)
             return;
-
+        block_sync_lds();
         // Each warp's lane 0 writes its partial results to shared memory
         const index_t smem_offset = warp_id;
         if(lane_id == 0)
diff --git a/include/ck_tile/utility/json_dump.hpp b/include/ck_tile/utility/json_dump.hpp
index b5bab28cac..03e97c0b76 100644
--- a/include/ck_tile/utility/json_dump.hpp
+++ b/include/ck_tile/utility/json_dump.hpp
@@ -160,23 +160,23 @@ void dump_gemm_json_results(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_batched_gemm_json_results(const std::string& json_filename,
-                                    const std::string& op_name,
-                                    int M,
-                                    int N,
-                                    int K,
-                                    int stride_A,
-                                    int stride_B,
-                                    int stride_C,
-                                    int batch_stride_A,
-                                    int batch_stride_B,
-                                    int batch_stride_C,
-                                    int batch_count,
-                                    bool pass,
-                                    float ave_time,
-                                    float tflops,
-                                    float gb_per_sec,
-                                    const std::string& kernel_name = "batched_gemm_basic")
+inline void dump_batched_gemm_json_results(const std::string& json_filename,
+                                           const std::string& op_name,
+                                           int M,
+                                           int N,
+                                           int K,
+                                           int stride_A,
+                                           int stride_B,
+                                           int stride_C,
+                                           int batch_stride_A,
+                                           int batch_stride_B,
+                                           int batch_stride_C,
+                                           int batch_count,
+                                           bool pass,
+                                           float ave_time,
+                                           float tflops,
+                                           float gb_per_sec,
+                                           const std::string& kernel_name = "batched_gemm_basic")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -218,20 +218,20 @@ void dump_grouped_gemm_json_results(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_flatmm_json_results(const std::string& json_filename,
-                              const std::string& datatype,
-                              int M,
-                              int N,
-                              int K,
-                              int stride_A,
-                              int stride_B,
-                              int stride_C,
-                              int kbatch,
-                              bool pass,
-                              float ave_time,
-                              float tflops,
-                              float gb_per_sec,
-                              const std::string& kernel_name = "flatmm_basic")
+inline void dump_flatmm_json_results(const std::string& json_filename,
+                                     const std::string& datatype,
+                                     int M,
+                                     int N,
+                                     int K,
+                                     int stride_A,
+                                     int stride_B,
+                                     int stride_C,
+                                     int kbatch,
+                                     bool pass,
+                                     float ave_time,
+                                     float tflops,
+                                     float gb_per_sec,
+                                     const std::string& kernel_name = "flatmm_basic")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -248,21 +248,22 @@ void dump_flatmm_json_results(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_gemm_multi_d_fp16_json_results(const std::string& json_filename,
-                                         const std::string& op_name,
-                                         int M,
-                                         int N,
-                                         int K,
-                                         int StrideA,
-                                         int StrideB,
-                                         int StrideD0,
-                                         int StrideD1,
-                                         int StrideE,
-                                         bool pass,
-                                         float ave_time,
-                                         float tflops,
-                                         float gb_per_sec,
-                                         const std::string& kernel_name = "gemm_multi_d_fp16")
+inline void
+dump_gemm_multi_d_fp16_json_results(const std::string& json_filename,
+                                    const std::string& op_name,
+                                    int M,
+                                    int N,
+                                    int K,
+                                    int StrideA,
+                                    int StrideB,
+                                    int StrideD0,
+                                    int StrideD1,
+                                    int StrideE,
+                                    bool pass,
+                                    float ave_time,
+                                    float tflops,
+                                    float gb_per_sec,
+                                    const std::string& kernel_name = "gemm_multi_d_fp16")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -280,14 +281,14 @@ void dump_gemm_multi_d_fp16_json_results(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_elementwise_json_results(const std::string& json_filename,
-                                   const std::string& prec,
-                                   int grid_size,
-                                   int block_size,
-                                   float ave_time,
-                                   float tflops,
-                                   float gb_per_sec,
-                                   const std::string& kernel_name = "elementwise")
+inline void dump_elementwise_json_results(const std::string& json_filename,
+                                          const std::string& prec,
+                                          int grid_size,
+                                          int block_size,
+                                          float ave_time,
+                                          float tflops,
+                                          float gb_per_sec,
+                                          const std::string& kernel_name = "elementwise")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -298,22 +299,22 @@ void dump_elementwise_json_results(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_layernorm2d_fwd_json_results(const std::string& json_filename,
-                                       const std::string& prec_i,
-                                       const std::string& prec_o,
-                                       const std::string& prec_sm,
-                                       const std::string& prec_sy,
-                                       int m,
-                                       int n,
-                                       int x_stride,
-                                       int xr_stride,
-                                       int y_stride,
-                                       int yr_stride,
-                                       bool pass,
-                                       float ave_time,
-                                       float tflops,
-                                       float gb_per_sec,
-                                       const std::string& kernel_name = "layernorm2d_fwd")
+inline void dump_layernorm2d_fwd_json_results(const std::string& json_filename,
+                                              const std::string& prec_i,
+                                              const std::string& prec_o,
+                                              const std::string& prec_sm,
+                                              const std::string& prec_sy,
+                                              int m,
+                                              int n,
+                                              int x_stride,
+                                              int xr_stride,
+                                              int y_stride,
+                                              int yr_stride,
+                                              bool pass,
+                                              float ave_time,
+                                              float tflops,
+                                              float gb_per_sec,
+                                              const std::string& kernel_name = "layernorm2d_fwd")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -357,13 +358,13 @@ void dump_reduce_json_results(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_permute_json_results(const std::string& json_filename,
-                               const std::string& data_type,
-                               bool pass,
-                               float ave_time,
-                               float tflop,
-                               float gb_per_sec,
-                               const std::string& kernel_name = "permute")
+inline void dump_permute_json_results(const std::string& json_filename,
+                                      const std::string& data_type,
+                                      bool pass,
+                                      float ave_time,
+                                      float tflop,
+                                      float gb_per_sec,
+                                      const std::string& kernel_name = "permute")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -373,19 +374,19 @@ void dump_permute_json_results(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_topk_softmax_json(const std::string& json_filename,
-                            const std::string& input_prec,
-                            const std::string& weight_prec,
-                            int tokens,
-                            int experts,
-                            int topk,
-                            int stride_input,
-                            int stride_output,
-                            float ave_time,
-                            float tflop,
-                            float gb_per_sec,
-                            bool pass,
-                            const std::string& kernel_name = "topk_softmax")
+inline void dump_topk_softmax_json(const std::string& json_filename,
+                                   const std::string& input_prec,
+                                   const std::string& weight_prec,
+                                   int tokens,
+                                   int experts,
+                                   int topk,
+                                   int stride_input,
+                                   int stride_output,
+                                   float ave_time,
+                                   float tflop,
+                                   float gb_per_sec,
+                                   bool pass,
+                                   const std::string& kernel_name = "topk_softmax")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -401,20 +402,20 @@ void dump_topk_softmax_json(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_rmsnorm2d_fwd_json(const std::string& json_filename,
-                             const std::string& prec_str,
-                             int m,
-                             int n,
-                             int x_stride,
-                             int xr_stride,
-                             int y_stride,
-                             int yr_stride,
-                             int use_model_sensitive_rmsnorm,
-                             float ave_time,
-                             float tflops,
-                             float gb_per_sec,
-                             bool pass,
-                             const std::string& kernel_name = "rmsnorm2d_fwd")
+inline void dump_rmsnorm2d_fwd_json(const std::string& json_filename,
+                                    const std::string& prec_str,
+                                    int m,
+                                    int n,
+                                    int x_stride,
+                                    int xr_stride,
+                                    int y_stride,
+                                    int yr_stride,
+                                    int use_model_sensitive_rmsnorm,
+                                    float ave_time,
+                                    float tflops,
+                                    float gb_per_sec,
+                                    bool pass,
+                                    const std::string& kernel_name = "rmsnorm2d_fwd")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -431,19 +432,19 @@ void dump_rmsnorm2d_fwd_json(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_add_rmsnorm2d_rdquant_fwd_json(
-    const std::string& json_filename,
-    const std::string& input_data_type,
-    const std::string& quantized_data_type,
-    int m,
-    int n,
-    int stride,
-    float epsilon,
-    float ave_time,
-    float tflops,
-    float gb_per_sec,
-    bool pass,
-    const std::string& kernel_name = "add_rmsnorm2d_rdquant_fwd")
+inline void
+dump_add_rmsnorm2d_rdquant_fwd_json(const std::string& json_filename,
+                                    const std::string& input_data_type,
+                                    const std::string& quantized_data_type,
+                                    int m,
+                                    int n,
+                                    int stride,
+                                    float epsilon,
+                                    float ave_time,
+                                    float tflops,
+                                    float gb_per_sec,
+                                    bool pass,
+                                    const std::string& kernel_name = "add_rmsnorm2d_rdquant_fwd")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -458,17 +459,17 @@ void dump_add_rmsnorm2d_rdquant_fwd_json(
     END_JSON_DUMP_FILE();
 }
 
-void dump_smoothquant_json(const std::string& json_filename,
-                           const std::string& prec_str,
-                           int m,
-                           int n,
-                           int x_stride,
-                           int y_stride,
-                           float ave_time,
-                           float tflops,
-                           float gb_per_sec,
-                           bool pass,
-                           const std::string& kernel_name = "smoothquant")
+inline void dump_smoothquant_json(const std::string& json_filename,
+                                  const std::string& prec_str,
+                                  int m,
+                                  int n,
+                                  int x_stride,
+                                  int y_stride,
+                                  float ave_time,
+                                  float tflops,
+                                  float gb_per_sec,
+                                  bool pass,
+                                  const std::string& kernel_name = "smoothquant")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -482,19 +483,19 @@ void dump_smoothquant_json(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_moe_sorting_json(const std::string& json_filename,
-                           const std::string& index_prec,
-                           const std::string& weight_prec,
-                           const std::string& workspace_size,
-                           int dispatch_policy,
-                           int tokens,
-                           int num_experts,
-                           int topk,
-                           float ave_time,
-                           float tflops,
-                           float gb_per_sec,
-                           bool pass,
-                           const std::string& kernel_name = "moe_sorting")
+inline void dump_moe_sorting_json(const std::string& json_filename,
+                                  const std::string& index_prec,
+                                  const std::string& weight_prec,
+                                  const std::string& workspace_size,
+                                  int dispatch_policy,
+                                  int tokens,
+                                  int num_experts,
+                                  int topk,
+                                  float ave_time,
+                                  float tflops,
+                                  float gb_per_sec,
+                                  bool pass,
+                                  const std::string& kernel_name = "moe_sorting")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -510,19 +511,19 @@ void dump_moe_sorting_json(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_batched_transpose_json(const std::string& json_filename,
-                                 int N,
-                                 int C,
-                                 int H,
-                                 int W,
-                                 const std::string& layout_in,
-                                 const std::string& layout_out,
-                                 const std::string& prec,
-                                 float ave_time,
-                                 float tflops,
-                                 float gb_per_sec,
-                                 bool pass,
-                                 const std::string& kernel_name = "batched_transpose")
+inline void dump_batched_transpose_json(const std::string& json_filename,
+                                        int N,
+                                        int C,
+                                        int H,
+                                        int W,
+                                        const std::string& layout_in,
+                                        const std::string& layout_out,
+                                        const std::string& prec,
+                                        float ave_time,
+                                        float tflops,
+                                        float gb_per_sec,
+                                        bool pass,
+                                        const std::string& kernel_name = "batched_transpose")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -538,19 +539,19 @@ void dump_batched_transpose_json(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_moe_smoothquant_json(const std::string& json_filename,
-                               const std::string& prec_i,
-                               const std::string& prec_o,
-                               int tokens,
-                               int hidden_size,
-                               int stride,
-                               int experts,
-                               int topk,
-                               bool pass,
-                               float ave_time,
-                               float tflops,
-                               float gb_per_sec,
-                               const std::string& kernel_name = "moe_smoothquant")
+inline void dump_moe_smoothquant_json(const std::string& json_filename,
+                                      const std::string& prec_i,
+                                      const std::string& prec_o,
+                                      int tokens,
+                                      int hidden_size,
+                                      int stride,
+                                      int experts,
+                                      int topk,
+                                      bool pass,
+                                      float ave_time,
+                                      float tflops,
+                                      float gb_per_sec,
+                                      const std::string& kernel_name = "moe_smoothquant")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -566,26 +567,26 @@ void dump_moe_smoothquant_json(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_fused_moe_json(const std::string& json_filename,
-                         const std::string& api_str,
-                         const std::string& prec_str,
-                         int tokens,
-                         bool is_local_token,
-                         int local_tokens,
-                         int experts,
-                         int topk,
-                         int hidden_size,
-                         int intermediate_size,
-                         int stride,
-                         int block_m,
-                         int activation,
-                         bool gate_only,
-                         bool fused_quant,
-                         bool pass,
-                         float ave_time,
-                         float tflops,
-                         float tb_per_sec,
-                         const std::string& kernel_name = "fused_moe")
+inline void dump_fused_moe_json(const std::string& json_filename,
+                                const std::string& api_str,
+                                const std::string& prec_str,
+                                int tokens,
+                                bool is_local_token,
+                                int local_tokens,
+                                int experts,
+                                int topk,
+                                int hidden_size,
+                                int intermediate_size,
+                                int stride,
+                                int block_m,
+                                int activation,
+                                bool gate_only,
+                                bool fused_quant,
+                                bool pass,
+                                float ave_time,
+                                float tflops,
+                                float tb_per_sec,
+                                const std::string& kernel_name = "fused_moe")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -610,29 +611,29 @@ void dump_fused_moe_json(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_fmha_fwd_json_results(const std::string& json_filename,
-                                const std::string& prec,
-                                const std::string& mode,
-                                const std::string& io_layout,
-                                int batch,
-                                int nhead,
-                                int nhead_k,
-                                int seqlen_qs,
-                                int seqlen_ks,
-                                int seqlen_kpads,
-                                int hdim_q,
-                                int hdim_v,
-                                float scale_s,
-                                float p_drop,
-                                bool lse,
-                                const std::string& qscale,
-                                const std::string& bias,
-                                const std::string& vlayout,
-                                bool pass,
-                                float ave_time,
-                                float tflops,
-                                float gb_per_sec,
-                                const std::string& kernel_name = "fmha_fwd")
+inline void dump_fmha_fwd_json_results(const std::string& json_filename,
+                                       const std::string& prec,
+                                       const std::string& mode,
+                                       const std::string& io_layout,
+                                       int batch,
+                                       int nhead,
+                                       int nhead_k,
+                                       int seqlen_qs,
+                                       int seqlen_ks,
+                                       int seqlen_kpads,
+                                       int hdim_q,
+                                       int hdim_v,
+                                       float scale_s,
+                                       float p_drop,
+                                       bool lse,
+                                       const std::string& qscale,
+                                       const std::string& bias,
+                                       const std::string& vlayout,
+                                       bool pass,
+                                       float ave_time,
+                                       float tflops,
+                                       float gb_per_sec,
+                                       const std::string& kernel_name = "fmha_fwd")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
@@ -658,33 +659,33 @@ void dump_fmha_fwd_json_results(const std::string& json_filename,
     END_JSON_DUMP_FILE();
 }
 
-void dump_fmha_bwd_json_results(const std::string& json_filename,
-                                const std::string& data_type,
-                                const std::string& mode,
-                                const std::string& i_perm,
-                                const std::string& o_perm,
-                                int batch,
-                                int nhead,
-                                int nhead_k,
-                                int seqlen_q,
-                                int seqlen_k,
-                                int hdim_q,
-                                int hdim_v,
-                                float scale,
-                                const std::string& bias,
-                                bool use_dbias,
-                                float p_drop,
-                                bool s_randval,
-                                bool deterministic,
-                                const std::string& mask,
-                                int mask_left,
-                                int mask_right,
-                                int workspace_size,
-                                bool pass,
-                                float ave_time,
-                                float tflops,
-                                float gb_per_sec,
-                                const std::string& kernel_name = "fmha_bwd")
+inline void dump_fmha_bwd_json_results(const std::string& json_filename,
+                                       const std::string& data_type,
+                                       const std::string& mode,
+                                       const std::string& i_perm,
+                                       const std::string& o_perm,
+                                       int batch,
+                                       int nhead,
+                                       int nhead_k,
+                                       int seqlen_q,
+                                       int seqlen_k,
+                                       int hdim_q,
+                                       int hdim_v,
+                                       float scale,
+                                       const std::string& bias,
+                                       bool use_dbias,
+                                       float p_drop,
+                                       bool s_randval,
+                                       bool deterministic,
+                                       const std::string& mask,
+                                       int mask_left,
+                                       int mask_right,
+                                       int workspace_size,
+                                       bool pass,
+                                       float ave_time,
+                                       float tflops,
+                                       float gb_per_sec,
+                                       const std::string& kernel_name = "fmha_bwd")
 {
     START_JSON_DUMP_FILE(json_filename);
     ADD_KEY_VALUE("name", kernel_name);
diff --git a/test/ck_tile/epilogue/test_cshuffle_epilogue_util.hpp b/test/ck_tile/epilogue/test_cshuffle_epilogue_util.hpp
index 4fdbf23864..9b90110c07 100644
--- a/test/ck_tile/epilogue/test_cshuffle_epilogue_util.hpp
+++ b/test/ck_tile/epilogue/test_cshuffle_epilogue_util.hpp
@@ -130,7 +130,7 @@ auto run_cshuffle_epilogue_test(ScaleType scale = ScaleType::None)
 
     constexpr index_t kMPerBlock = Problem::kMPerBlock;
     constexpr index_t kNPerBlock = Problem::kNPerBlock;
-    constexpr index_t kBlockSize = Problem::kBlockSize;
+    index_t kBlockSize = ck_tile::is_wave32() ? Problem::kBlockSize / 2 : Problem::kBlockSize;
 
     std::cout << "Running CShuffleEpilogue test with M=" << M << ", N=" << N
               << ", MPerBlock=" << kMPerBlock << ", NPerBlock=" << kNPerBlock
diff --git a/test/ck_tile/gemm_multi_abd/CMakeLists.txt b/test/ck_tile/gemm_multi_abd/CMakeLists.txt
index 2dccf9cd60..03759652cd 100644
--- a/test/ck_tile/gemm_multi_abd/CMakeLists.txt
+++ b/test/ck_tile/gemm_multi_abd/CMakeLists.txt
@@ -7,7 +7,7 @@ if(CK_USE_OCP_FP8)
     list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
 endif()
 
-if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx11|gfx12")
     add_gtest_executable(test_ck_tile_gemm_multi_abd_cshuffle test_gemm_multi_abd_cshuffle.cpp)
     add_gtest_executable(test_ck_tile_gemm_multi_abd_default2d test_gemm_multi_abd_default2d.cpp)
     target_compile_definitions(test_ck_tile_gemm_multi_abd_cshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
diff --git a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_cshuffle.cpp b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_cshuffle.cpp
index 08997529b2..ab00f16632 100644
--- a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_cshuffle.cpp
+++ b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_cshuffle.cpp
@@ -20,20 +20,21 @@ using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
 using KernelTypes = ::testing::Types<
     // Has cshuffle epilogue enabled
     //          A0Layout, A1Layout, B0Layout, B1Layout CLayout, D0Layout, D1Layout, A0DataType, A01DataType B0DataType, B0DataType, D0DataType,  D1DataType, AccDataType, EDataType, AElementWiseFn, BElementWiseFn, CDElementWiseFn, UseCshuffleEpilog
+#if !CK_TILE_USE_WMMA || CK_TILE_USE_OCP_FP8
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           BF16,       BF16,       F32,      F32,          AddScale,       AddScale,    ElementWiseAddAdd,  std::true_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           F8,         F8,         F32,      F32,          AddScale,       AddScale,    ElementWiseAddAdd,  std::true_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           BF16,       BF16,       F32,      F32,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           F8,         F8,         F32,      F32,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
+#endif    
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F16,        F16,        F32,      F16,          AddScale,       AddScale,    ElementWiseAddAdd,  std::true_type>,
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          BF16,       BF16,       F32,      F32,          AddScale,       AddScale,    ElementWiseAddAdd,  std::true_type>,
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F32,        F32,        F32,      F32,          AddScale,       AddScale,    ElementWiseAddAdd,  std::true_type>,
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F32,        F32,        F32,      F16,          AddScale,       AddScale,    ElementWiseAddAdd,  std::true_type>,
-    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           BF16,       BF16,       F32,      F32,          AddScale,       AddScale,    ElementWiseAddAdd,  std::true_type>,
-    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           F8,         F8,         F32,      F32,          AddScale,       AddScale,    ElementWiseAddAdd,  std::true_type>,
-
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F16,        F16,        F32,      F16,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          BF16,       BF16,       F32,      F32,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F32,        F32,        F32,      F32,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
-    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F32,        F32,        F32,      F16,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
-    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           BF16,       BF16,       F32,      F32,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>,
-    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,            F8,        F8,        F8,           F8,         F8,         F32,      F32,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>
-    >;
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,           F16,       F16,       F16,          F32,        F32,        F32,      F16,          AddScale,       AddScale,    MultiplyMultiply,  std::true_type>
+     >;
 // clang-format on
 
 TYPED_TEST_SUITE(TestCkTileGemmMultiABD, KernelTypes);
diff --git a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_default2d.cpp b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_default2d.cpp
index dac33b4656..c4bfc3e7cb 100644
--- a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_default2d.cpp
+++ b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_default2d.cpp
@@ -20,17 +20,19 @@ using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
 using KernelTypes = ::testing::Types<
     // Has cshuffle epilogue disabled
     //          A0Layout, A1Layout, B0Layout, B1Layout CLayout, D0Layout, D1Layout, A0DataType, A01DataType B0DataType, B0DataType, D0DataType,  D1DataType, AccDataType, EDataType, AElementWiseFn, BElementWiseFn, CDElementWiseFn, UseCshuffleEpilog
+#if !CK_TILE_USE_WMMA || CK_TILE_USE_OCP_FP8
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,           F8,         F8,         F8,           BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    ElementWiseAddAdd,  std::false_type>,
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,           F8,         F8,         F8,           BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    MultiplyMultiply,   std::false_type>,
+#endif
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F16,        F16,        F32,      F16,         AddScale,       AddScale,    ElementWiseAddAdd,  std::false_type>,
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F32,        F32,        F32,      F16,         AddScale,       AddScale,    ElementWiseAddAdd,  std::false_type>,
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F32,        F32,        F32,      F32,         AddScale,       AddScale,    ElementWiseAddAdd,  std::false_type>,
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    ElementWiseAddAdd,  std::false_type>,
-    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,           F8,         F8,         F8,           BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    ElementWiseAddAdd,  std::false_type>,
 
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F16,        F16,        F32,      F16,         AddScale,       AddScale,    MultiplyMultiply,  std::false_type>,
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F32,        F32,        F32,      F16,         AddScale,       AddScale,    MultiplyMultiply,  std::false_type>,
     std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          F32,        F32,        F32,      F32,         AddScale,       AddScale,    MultiplyMultiply,  std::false_type>,
-    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    MultiplyMultiply,  std::false_type>,
-    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F8,           F8,         F8,         F8,           BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    MultiplyMultiply,  std::false_type>
+    std::tuple<    Row,     Row,     Col,     Col,      Row,     Row,      Row,      F16,          F16,        F16,        F16,          BF16,       BF16,       F32,      BF16,        AddScale,       AddScale,    MultiplyMultiply,  std::false_type>
     >;
 // clang-format on
 
diff --git a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_util.hpp b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_util.hpp
index ee045c7f48..8cee050db2 100644
--- a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_util.hpp
+++ b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_util.hpp
@@ -23,6 +23,28 @@ static constexpr inline auto is_row_major(Layout layout_)
                                                  ck_tile::tensor_layout::gemm::RowMajor>>{};
 }
 
+// Selects K_Warp_Tile. Returns 16 when CK_TILE_USE_WMMA is set; otherwise the result depends on
+// M_Warp_Tile and, if CK_GFX950_SUPPORT is defined, on whether PrecType is fp8_t or bf8_t.
+template <typename PrecType, ck_tile::index_t M_Warp_Tile>
+constexpr ck_tile::index_t get_k_warp_tile()
+{
+#if CK_TILE_USE_WMMA
+    return 16;
+#elif defined(CK_GFX950_SUPPORT)
+    constexpr bool is_8bit_float =
+        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
+    if constexpr(M_Warp_Tile == 32)
+        return is_8bit_float ? 64 : 16;
+    else
+        return is_8bit_float ? 128 : 32;
+#else
+    if constexpr(M_Warp_Tile == 32)
+        return 16;
+    else
+        return 32;
+#endif
+}
+
 template <typename A0DataType,
           typename B0DataType,
           typename AccDataType,
@@ -103,17 +125,25 @@ class TestCkTileGemmMultiABD : public ::testing::Test
                                                                    DsDataType::size()>& args,
                                const ck_tile::stream_config& s)
     {
-        constexpr ck_tile::index_t M_Tile = 256;
-        constexpr ck_tile::index_t N_Tile = 256;
-        constexpr ck_tile::index_t K_Tile = 32;
+        constexpr ck_tile::index_t M_Tile = 128;
+        constexpr ck_tile::index_t N_Tile = 128;
+        constexpr ck_tile::index_t K_Tile = 64;
 
         constexpr ck_tile::index_t M_Warp = 2;
         constexpr ck_tile::index_t N_Warp = 2;
         constexpr ck_tile::index_t K_Warp = 1;
 
+#if CK_TILE_USE_WMMA
+        using ADataType =
+            ck_tile::remove_cvref_t<std::tuple_element_t<ck_tile::number<0>{}, AsDataType>>;
+        constexpr ck_tile::index_t M_Warp_Tile = 16;
+        constexpr ck_tile::index_t N_Warp_Tile = 16;
+        constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<ADataType, N_Warp_Tile>();
+#else
         constexpr ck_tile::index_t M_Warp_Tile = 32;
         constexpr ck_tile::index_t N_Warp_Tile = 32;
         constexpr ck_tile::index_t K_Warp_Tile = 16;
+#endif
 
         constexpr bool DoubleSmemBuffer = false;
 
diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp
index 43a73738d9..7c085b5098 100644
--- a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp
@@ -13,6 +13,28 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 
+// Selects K_Warp_Tile. Returns 16 when CK_TILE_USE_WMMA is set; otherwise the result depends on
+// M_Warp_Tile and, if CK_GFX950_SUPPORT is defined, on whether PrecType is fp8_t or bf8_t.
+template <typename PrecType, ck_tile::index_t M_Warp_Tile>
+constexpr ck_tile::index_t get_k_warp_tile()
+{
+#if CK_TILE_USE_WMMA
+    return 16;
+#elif defined(CK_GFX950_SUPPORT)
+    constexpr bool is_8bit_float =
+        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
+    if constexpr(M_Warp_Tile == 32)
+        return is_8bit_float ? 64 : 16;
+    else
+        return is_8bit_float ? 128 : 32;
+#else
+    if constexpr(M_Warp_Tile == 32)
+        return 16;
+    else
+        return 32;
+#endif
+}
+
 template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
 auto calculate_rtol_atol(const ck_tile::index_t K,
                          const ck_tile::index_t kbatch,
@@ -80,7 +102,7 @@ struct config_wmma
 
     static constexpr ck_tile::index_t M_Warp_Tile = 16;
     static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<Datatype, M_Warp_Tile>();
 };
 
 template <typename Tuple>
diff --git a/test/ck_tile/grouped_gemm_multi_d/CMakeLists.txt b/test/ck_tile/grouped_gemm_multi_d/CMakeLists.txt
index f86da3c4d5..5363e365fc 100644
--- a/test/ck_tile/grouped_gemm_multi_d/CMakeLists.txt
+++ b/test/ck_tile/grouped_gemm_multi_d/CMakeLists.txt
@@ -9,7 +9,7 @@ endif()
 # Use standard asm for rtn bf16 conversion instead of turncate
 list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3)
 
-if(GPU_TARGETS MATCHES "gfx94|gfx95")
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx11|gfx12")
     add_gtest_executable(test_ck_tile_grouped_gemm_multi_d test_grouped_gemm_multi_d.cpp)
     target_compile_options(test_ck_tile_grouped_gemm_multi_d PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 endif()
diff --git a/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d.cpp b/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d.cpp
index 65c662199b..8d56c274aa 100644
--- a/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d.cpp
+++ b/test/ck_tile/grouped_gemm_multi_d/test_grouped_gemm_multi_d.cpp
@@ -29,9 +29,6 @@ template <typename ALayout_,
           int M_Warp_val_,
           int N_Warp_val_,
           int K_Warp_val_,
-          int M_Warp_Tile_val_,
-          int N_Warp_Tile_val_,
-          int K_Warp_Tile_val_,
           bool DoubleSmemBuffer_val_,
           ck_tile::GemmPipelineScheduler Scheduler_val_,
           PipelineType Pipeline_val_,
@@ -50,15 +47,21 @@ struct KernelConfig
     using EDataType    = EDataType_;
     using DsDataType   = ck_tile::tuple<D0DataType_, D1DataType_>;
 
-    static constexpr int M_Tile_            = M_Tile_val_;
-    static constexpr int N_Tile_            = N_Tile_val_;
-    static constexpr int K_Tile_            = K_Tile_val_;
-    static constexpr int M_Warp_            = M_Warp_val_;
-    static constexpr int N_Warp_            = N_Warp_val_;
-    static constexpr int K_Warp_            = K_Warp_val_;
-    static constexpr int M_Warp_Tile_       = M_Warp_Tile_val_;
-    static constexpr int N_Warp_Tile_       = N_Warp_Tile_val_;
-    static constexpr int K_Warp_Tile_       = K_Warp_Tile_val_;
+    static constexpr int M_Tile_ = M_Tile_val_;
+    static constexpr int N_Tile_ = N_Tile_val_;
+    static constexpr int K_Tile_ = K_Tile_val_;
+    static constexpr int M_Warp_ = M_Warp_val_;
+    static constexpr int N_Warp_ = N_Warp_val_;
+    static constexpr int K_Warp_ = K_Warp_val_;
+#if CK_TILE_USE_WMMA
+    static constexpr int M_Warp_Tile_ = 16;
+    static constexpr int N_Warp_Tile_ = 16;
+    static constexpr int K_Warp_Tile_ = 16;
+#else
+    static constexpr int M_Warp_Tile_ = 32;
+    static constexpr int N_Warp_Tile_ = 32;
+    static constexpr int K_Warp_Tile_ = (M_Warp_val_ == 2) ? 16 : 8;
+#endif
     static constexpr bool DoubleSmemBuffer_ = DoubleSmemBuffer_val_;
     static constexpr auto Scheduler_        = Scheduler_val_;
     static constexpr PipelineType Pipeline_ = Pipeline_val_;
@@ -68,21 +71,21 @@ struct KernelConfig
 
 // clang-format off
 using KernelTypes = ::testing::Types<
-    //             ALayout, BLayout, ELayout, ADataType, BDataType, D0DataType, D1DataType, AccDataType, EDataType, M_N_KTiles,    M_N_K_Warps,     M_N_K_Warp_Tile, DoubleSmemBuffer, Scheduler, Pipeline, Persistent
+    //             ALayout, BLayout, ELayout, ADataType, BDataType, D0DataType, D1DataType, AccDataType, EDataType, M_N_KTiles,    M_N_K_Warps,     DoubleSmemBuffer, Scheduler, Pipeline, Persistent
     // FP16 A/B/D/E
-    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  128, 32, 64,    4, 1, 1,       32, 32, 8,        false,           ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory, false>, // memory
-    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  128, 32, 64,    4, 1, 1,       32, 32, 8,        false,           ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory, true>, // memory
-    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  256, 256, 64,   2, 2, 1,       32, 32, 16,       false,           ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3, false>, // v3
-    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  256, 256, 64,   2, 2, 1,       32, 32, 16,       false,           ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3, true>, // v3
-    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  256, 256, 32,   2, 2, 1,       32, 32, 16,       true,            ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4, false>, // v4
-    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  256, 256, 32,   2, 2, 1,       32, 32, 16,       true,            ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4, true>, // v4
+    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  128, 32, 64,    4, 1, 1,       false,           ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory, false>, // memory
+    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  128, 32, 64,    4, 1, 1,       false,           ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory, true>, // memory
+    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  128, 128, 64,   2, 2, 1,       false,           ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3, false>, // v3
+    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  128, 128, 64,   2, 2, 1,       false,           ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3, true>, // v3
+    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  128, 128, 32,   2, 2, 1,       true,            ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4, false>, // v4
+    KernelConfig<    Row,     Col,     Row,         F16,       F16,        F16,        F16,         F32,       F16,  128, 128, 32,   2, 2, 1,       true,            ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4, true>, // v4
     // BF16 A/B/D/E
-    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  128, 32, 64,    4, 1, 1,       32, 32, 8,        false,           ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory, false>, // memory
-    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  128, 32, 64,    4, 1, 1,       32, 32, 8,        false,           ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory, true>, // memory
-    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  256, 256, 64,   2, 2, 1,       32, 32, 16,       false,           ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3, false>, // v3
-    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  256, 256, 64,   2, 2, 1,       32, 32, 16,       false,           ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3, true>, // v3
-    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  256, 256, 32,   2, 2, 1,       32, 32, 16,       true,            ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4, false>, // v4
-    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  256, 256, 32,   2, 2, 1,       32, 32, 16,       true,            ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4, true> // v4
+    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  128, 32, 64,    4, 1, 1,       false,           ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory, false>, // memory
+    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  128, 32, 64,    4, 1, 1,       false,           ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory, true>, // memory
+    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  128, 128, 64,   2, 2, 1,       false,           ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3, false>, // v3
+    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  128, 128, 64,   2, 2, 1,       false,           ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3, true>, // v3
+    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  128, 128, 32,   2, 2, 1,       true,            ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4, false>, // v4
+    KernelConfig<    Row,     Col,     Row,        BF16,      BF16,       BF16,       BF16,         F32,      BF16,  128, 128, 32,   2, 2, 1,       true,            ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4, true> // v4
     >;
 // clang-format on
 
diff --git a/test/ck_tile/grouped_gemm_preshuffle/CMakeLists.txt b/test/ck_tile/grouped_gemm_preshuffle/CMakeLists.txt
index 08b413aea9..3a230aed0c 100644
--- a/test/ck_tile/grouped_gemm_preshuffle/CMakeLists.txt
+++ b/test/ck_tile/grouped_gemm_preshuffle/CMakeLists.txt
@@ -6,7 +6,7 @@ if(CK_USE_OCP_FP8)
     list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
 endif()
 
-if(GPU_TARGETS MATCHES "gfx94|gfx95")
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     add_gtest_executable(test_ck_tile_grouped_gemm_preshuffle test_grouped_gemm_preshuffle.cpp)
     target_compile_options(test_ck_tile_grouped_gemm_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 endif()
diff --git a/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle.cpp b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle.cpp
index 623d0152d6..450b7b8f24 100644
--- a/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle.cpp
+++ b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle.cpp
@@ -50,16 +50,16 @@ struct KernelConfig
 // clang-format off
 using KernelTypes = ::testing::Types<
     //               ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, Persistent ,M_Tile, N_Tile, K_Tile, BlockPerCu
-    KernelConfig<    Row,     Col,     Row,       F16,       F16,         F32,       F16,   False,    16,     64,    256,         1>,
+#if !CK_TILE_USE_WMMA || CK_TILE_USE_OCP_FP8 // F8 rows are dropped on WMMA without OCP FP8
     KernelConfig<    Row,     Col,     Row,       F8,        F8,          F32,       F16,   False,    16,     64,    256,         1>,
-    KernelConfig<    Row,     Col,     Row,       F16,       F16,         F32,       F16,   False,  128,    128,    128,         2>,
     KernelConfig<    Row,     Col,     Row,       F8,        F8,          F32,       F16,   False,   128,    128,    128,         2>,
-
-    KernelConfig<    Row,     Col,     Row,       F16,       F16,         F32,       F16,   True,    16,     64,    256,         1>,
     KernelConfig<    Row,     Col,     Row,       F8,        F8,          F32,       F16,   True,    16,     64,    256,         1>,
-    KernelConfig<    Row,     Col,     Row,       F16,       F16,         F32,       F16,   True,  128,    128,    128,         2>,
     KernelConfig<    Row,     Col,     Row,       F8,        F8,          F32,       F16,   True,   128,    128,    128,         2>,
-
+#endif
+    KernelConfig<    Row,     Col,     Row,       F16,       F16,         F32,       F16,   False,    16,     64,    256,         1>,
+    KernelConfig<    Row,     Col,     Row,       F16,       F16,         F32,       F16,   False,  128,    128,    128,         2>,
+    KernelConfig<    Row,     Col,     Row,       F16,       F16,         F32,       F16,   True,    16,     64,    256,         1>,
+    KernelConfig<    Row,     Col,     Row,       F16,       F16,         F32,       F16,   True,  128,    128,    128,         2>,
     KernelConfig<    Row,     Col,     Row,       BF16,      BF16,         F32,       BF16,   False,    16,     64,    256,         1>,
     KernelConfig<    Row,     Col,     Row,       BF16,      BF16,         F32,       BF16,   False,    16,     64,    256,         1>,
     KernelConfig<    Row,     Col,     Row,       BF16,      BF16,         F32,       BF16,   False,  128,    128,    128,         2>,
diff --git a/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_util.hpp b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_util.hpp
index 0eb388082b..5628b6feae 100644
--- a/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_util.hpp
+++ b/test/ck_tile/grouped_gemm_preshuffle/test_grouped_gemm_preshuffle_util.hpp
@@ -14,6 +14,9 @@
 template <typename PrecType, ck_tile::index_t M_Warp_Tile>
 constexpr ck_tile::index_t get_k_warp_tile_flatmm()
 {
+#if CK_TILE_USE_WMMA
+    return 16;
+#else
 #if defined(CK_GFX950_SUPPORT)
     if constexpr(M_Warp_Tile == 32)
         return sizeof(PrecType) == 2 ? 16 : 64;
@@ -25,6 +28,7 @@ constexpr ck_tile::index_t get_k_warp_tile_flatmm()
     else
         return sizeof(PrecType) == 2 ? 32 : 64;
 #endif
+#endif
 }
 
 template <typename Tuple>
@@ -101,13 +105,40 @@ class TestCkTileGroupedGemmPreshuffle : public ::testing::Test
     auto shuffle_b(const ck_tile::HostTensor<T>& t)
     {
         assert(t.get_lengths().size() == 2);
-        int n_                = t.get_lengths()[1];
-        int k_                = t.get_lengths()[0];
-        constexpr int divisor = N_Warp_Tile == 32 ? 2 : 4;
-        ck_tile::HostTensor<T> t_view(
-            {n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor});
-        std::copy(t.begin(), t.end(), t_view.begin());
-        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+        int n_ = t.get_lengths()[1]; // extent of dimension 1, used as N below
+        int k_ = t.get_lengths()[0]; // extent of dimension 0, used as K below
+
+        if(ck_tile::is_gfx12_supported()) // gfx12: 6-D view, K tile split three ways
+        {
+            constexpr int divisor      = 2;
+            constexpr int kABK1PerLane = 8;
+            constexpr int kABK0PerLane = K_Warp_Tile / divisor / kABK1PerLane;
+            ck_tile::HostTensor<T> t_view({n_ / N_Warp_Tile,
+                                           N_Warp_Tile,
+                                           k_ / K_Warp_Tile,
+                                           kABK0PerLane,
+                                           divisor,
+                                           kABK1PerLane});
+            std::copy(t.begin(), t.end(), t_view.begin()); // copy t into the 6-D view
+            return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5});
+        }
+        else // not gfx12: 5-D view, K tile split as divisor x (K_Warp_Tile / divisor)
+        {
+            int divisor = 1;
+            if(ck_tile::is_gfx11_supported())
+            {
+                divisor = 1; // same value as the initializer above
+            }
+            else
+            {
+                assert(is_wave32() == false); // this path expects a non-wave32 device
+                divisor = N_Warp_Tile == 32 ? 2 : 4;
+            }
+            ck_tile::HostTensor<T> t_view(
+                {n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor});
+            std::copy(t.begin(), t.end(), t_view.begin()); // copy t into the 5-D view
+            return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+        }
     }
 
     template <typename ALayout, typename BLayout, typename CLayout>
@@ -115,6 +146,11 @@ class TestCkTileGroupedGemmPreshuffle : public ::testing::Test
                              const ck_tile::stream_config& s,
                              void* kargs_ptr)
     {
+        constexpr ck_tile::index_t WaveSize     = 32; // NOTE(review): fixed 32; confirm on wave64
+        constexpr ck_tile::index_t MIterPerWarp = M_Tile / (M_Warp * M_Warp_Tile);
+        constexpr bool SupportVectorSize16 =
+            (M_Warp_Tile * K_Warp_Tile * sizeof(ADataType) * MIterPerWarp / WaveSize) % 16 == 0;
+        constexpr int VectorSize = SupportVectorSize16 ? 16 : 8; // 16 if the check holds, else 8
 
         using GemmShape =
             ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
@@ -137,7 +173,8 @@ class TestCkTileGroupedGemmPreshuffle : public ::testing::Test
                                              /*UseStructuredSparsity*/ false,
                                              /*Persistent*/ false,
                                              /*NumWaveGroups*/ 1,
-                                             /*Preshuffle*/ true>;
+                                             /*Preshuffle*/ true,
+                                             VectorSize>;
 
         using UniversalGemmProblem =
             ck_tile::UniversalGemmPipelineProblem<ADataType,
@@ -210,6 +247,12 @@ class TestCkTileGroupedGemmPreshuffle : public ::testing::Test
                                         const ck_tile::stream_config& s,
                                         void* kargs_ptr)
     {
+        constexpr ck_tile::index_t WaveSize     = 32; // NOTE(review): fixed 32; confirm on wave64
+        constexpr ck_tile::index_t MIterPerWarp = M_Tile / (M_Warp * M_Warp_Tile);
+        constexpr bool SupportVectorSize16 =
+            (M_Warp_Tile * K_Warp_Tile * sizeof(ADataType) * MIterPerWarp / WaveSize) % 16 == 0;
+        constexpr int VectorSize = SupportVectorSize16 ? 16 : 8; // 16 if the check holds, else 8
+
         using GemmShape =
             ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
                                    ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
@@ -230,7 +273,8 @@ class TestCkTileGroupedGemmPreshuffle : public ::testing::Test
                                              /*UseStructuredSparsity*/ false,
                                              /*Persistent*/ true, // Enable persistent mode
                                              /*NumWaveGroups*/ 1,
-                                             /*Preshuffle*/ true>;
+                                             /*Preshuffle*/ true,
+                                             VectorSize>;
 
         using UniversalGemmProblem =
             ck_tile::UniversalGemmPipelineProblem<ADataType,

From b4a34371a6a075fd00e22cf589f683de5f9271e3 Mon Sep 17 00:00:00 2001
From: Enrico Degregori <73224202+EnricoDeg@users.noreply.github.com>
Date: Fri, 12 Dec 2025 19:26:47 +0100
Subject: [PATCH 03/10] Fix compilation ab scale multi target (#3413)

---
 .../gpu/grid/gridwise_gemm_wmma_cshuffle_v3_ab_scale.hpp     | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_ab_scale.hpp
index ac5b7dd0c4..0974f45a2b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_ab_scale.hpp
@@ -527,11 +527,6 @@ struct GridwiseGemm_wmma_cshuffle_v3_ab_scale
         }
         else
         {
-#if defined(__gfx11__)
-            // TODO: remove this restriction
-            static_assert(ScaleBlockM >= MPerWmma,
-                          "ScaleBlockM must be greater equal than MPerWmma");
-#endif
             static_assert(
                 ScaleBlockK >=
                     WmmaSelector<ComputeTypeA, ComputeTypeB, AccDataType, MPerWmma, NPerWmma>::

From 9707ddb444f42b490c73b7884babccde2988ed7e Mon Sep 17 00:00:00 2001
From: Cong Ma <142121551+CongMa13@users.noreply.github.com>
Date: Fri, 12 Dec 2025 17:08:26 -0700
Subject: [PATCH 04/10] [CK TILE GEMM STREAMK] update identifier names
 according to the new code style (#3348)

* [CK TILE GEMM STREAMK] update identifier names according to the new code style
---
 .../ck_tile/40_streamk_gemm/gemm_utils.hpp    |  56 +--
 .../40_streamk_gemm/run_gemm_example.inc      | 380 +++++++++---------
 .../40_streamk_gemm/streamk_gemm_basic.cpp    | 204 +++++-----
 3 files changed, 328 insertions(+), 312 deletions(-)

diff --git a/example/ck_tile/40_streamk_gemm/gemm_utils.hpp b/example/ck_tile/40_streamk_gemm/gemm_utils.hpp
index dad31ec637..34c6c6b0ae 100644
--- a/example/ck_tile/40_streamk_gemm/gemm_utils.hpp
+++ b/example/ck_tile/40_streamk_gemm/gemm_utils.hpp
@@ -7,46 +7,46 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 
-struct GemmConfigBase
+struct GemmConfigurationBase
 {
-    static constexpr bool kPadM = true;
-    static constexpr bool kPadN = true;
-    static constexpr bool kPadK = true;
+    static constexpr bool PAD_M = true;
+    static constexpr bool PAD_N = true;
+    static constexpr bool PAD_K = true;
 
-    static constexpr bool PermuteA = false;
-    static constexpr bool PermuteB = false;
+    static constexpr bool PERMUTE_A = false;
+    static constexpr bool PERMUTE_B = false;
 
-    static constexpr bool TransposeC            = false;
-    static constexpr bool UseStructuredSparsity = false;
+    static constexpr bool TRANSPOSE_C             = false;
+    static constexpr bool USE_STRUCTURED_SPARSITY = false;
 
-    static constexpr int kBlockPerCu                = 1;
-    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
-    static constexpr ck_tile::index_t NumWaveGroups = 1;
-    static constexpr bool Preshuffle                = false;
-    static constexpr bool DoubleSmemBuffer          = false;
+    static constexpr int BLOCK_PER_CU                 = 1;
+    static constexpr auto SCHEDULER                   = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t NUM_WAVE_GROUPS = 1;
+    static constexpr bool PRESHUFFLE                  = false;
+    static constexpr bool DOUBLE_SMEM_BUFFER          = false;
 };
 
-template <typename PrecType, bool Persistent_>
-struct GemmConfigMemoryInterwave : public GemmConfigBase
+template <typename PrecisionType, bool IsPersistent>
+struct GemmConfigurationMemoryInterwave : public GemmConfigurationBase
 {
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 16;
+    static constexpr ck_tile::index_t M_TILE = 256;
+    static constexpr ck_tile::index_t N_TILE = 256;
+    static constexpr ck_tile::index_t K_TILE = 16;
 
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
+    static constexpr ck_tile::index_t M_WARP = 2;
+    static constexpr ck_tile::index_t N_WARP = 2;
+    static constexpr ck_tile::index_t K_WARP = 1;
 
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16;
+    static constexpr ck_tile::index_t M_WARP_TILE = 32;
+    static constexpr ck_tile::index_t N_WARP_TILE = 32;
+    static constexpr ck_tile::index_t K_WARP_TILE = sizeof(PrecisionType) == 2 ? 8 : 16;
 
-    static constexpr bool Persistent = Persistent_;
-    static constexpr auto Scheduler  = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr bool PERSISTENT = IsPersistent;
+    static constexpr auto SCHEDULER  = ck_tile::GemmPipelineScheduler::Intrawave;
 };
 
 template <typename ADataType_, typename BDataType_ = ADataType_, typename CDataType_ = ADataType_>
-struct StreamKGemmTypeConfig
+struct StreamKGemmTypeConfiguration
 {
     using ADataType   = ADataType_;
     using BDataType   = BDataType_;
@@ -54,7 +54,7 @@ struct StreamKGemmTypeConfig
     using CDataType   = CDataType_;
 };
 
-auto create_args(int argc, char* argv[])
+auto createArgs(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
     arg_parser.insert("m", "512", "m dimension")
diff --git a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
index d18ac2e68a..7442bd33f2 100644
--- a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
+++ b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
@@ -12,31 +12,35 @@ static constexpr inline auto is_row_major(Layout)
 }
 
 template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
-auto calculate_rtol_atol(const ck_tile::index_t K,
-                         const ck_tile::index_t kbatch,
-                         const float max_accumulated_value)
+auto calculateRtolAtol(const ck_tile::index_t k_dim,
+                       const ck_tile::index_t k_batch,
+                       const float max_accumulated_value)
 {
     using ComputeType =
         std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
     // Calculate thresholds
-    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
-        ck_tile::integer_divide_ceil(K, kbatch));
-    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
-        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+    const auto relative_tolerance =
+        ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+            ck_tile::integer_divide_ceil(k_dim, k_batch));
+    const auto absolute_tolerance =
+        ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+            max_accumulated_value / k_batch, ck_tile::integer_divide_ceil(k_dim, k_batch));
     // Calculate error due to multiple WGs working in the same C macro tile
-    const auto rtol_split_k =
-        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
-    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
-        max_accumulated_value, kbatch);
+    const auto relative_tolerance_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(k_batch);
+    const auto absolute_tolerance_split_k =
+        ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(max_accumulated_value,
+                                                                         k_batch);
     // Use higher threshold
-    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+    return ck_tile::make_tuple(std::max(relative_tolerance, relative_tolerance_split_k),
+                               std::max(absolute_tolerance, absolute_tolerance_split_k));
 }
 
-template <typename GemmConfig,
+template <typename GemmConfiguration,
           typename ADataType,
           typename BDataType,
           typename DsDataType,
-          typename AccDataType,
+          typename AccumulatorDataType,
           typename CDataType,
           typename ALayout,
           typename BLayout,
@@ -45,102 +49,107 @@ template <typename GemmConfig,
           typename CDEElementWise = ck_tile::element_wise::PassThrough,
           ck_tile::StreamKReductionStrategy ReductionStrategy>
 std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
-                                         const ck_tile::stream_config& s);
+                                         const ck_tile::stream_config& stream_config);
 
-template <typename GemmConfig,
+template <typename GemmConfiguration,
           typename ADataType,
           typename BDataType,
           typename DsDataType,
-          typename AccDataType,
+          typename AccumulatorDataType,
           typename CDataType,
           typename ALayout,
           typename BLayout,
           typename DsLayout,
           typename CLayout,
           typename CDEElementWise = ck_tile::element_wise::PassThrough>
-std::tuple<float, ck_tile::index_t>
-invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
-            ck_tile::DeviceMem& b_k_n_dev_buf,
-            ck_tile::DeviceMem& c_m_n_dev_buf,
-            ck_tile::index_t M,
-            ck_tile::index_t N,
-            ck_tile::index_t K,
-            ck_tile::index_t stride_A,
-            ck_tile::index_t stride_B,
-            ck_tile::index_t stride_C,
-            int n_warmup,
-            int n_repeat,
-            bool flush_cache,
-            ck_tile::StreamKReductionStrategy reduction_strategy)
+std::tuple<float, ck_tile::index_t> invokeGemm(ck_tile::DeviceMem& a_m_k_device_memory,
+                                               ck_tile::DeviceMem& b_k_n_device_memory,
+                                               ck_tile::DeviceMem& c_m_n_device_memory,
+                                               ck_tile::index_t m_dim,
+                                               ck_tile::index_t n_dim,
+                                               ck_tile::index_t k_dim,
+                                               ck_tile::index_t stride_a,
+                                               ck_tile::index_t stride_b,
+                                               ck_tile::index_t stride_c,
+                                               int warmup_iterations,
+                                               int repeat_iterations,
+                                               bool flush_cache,
+                                               ck_tile::StreamKReductionStrategy reduction_strategy)
 {
-    ck_tile::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(),
-                                  b_k_n_dev_buf.GetDeviceBuffer(),
-                                  c_m_n_dev_buf.GetDeviceBuffer(),
-                                  M,
-                                  N,
-                                  K,
-                                  stride_A,
-                                  stride_B,
-                                  stride_C};
+    ck_tile::StreamKHostArgs args{a_m_k_device_memory.GetDeviceBuffer(),
+                                  b_k_n_device_memory.GetDeviceBuffer(),
+                                  c_m_n_device_memory.GetDeviceBuffer(),
+                                  m_dim,
+                                  n_dim,
+                                  k_dim,
+                                  stride_a,
+                                  stride_b,
+                                  stride_c};
 
-    std::tuple<float, ck_tile::index_t> ave_time_and_batch;
+    std::tuple<float, ck_tile::index_t> average_time_and_batch;
 
     if(reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic)
     {
-        ave_time_and_batch = gemm<GemmConfig,
-                                  ADataType,
-                                  BDataType,
-                                  DsDataType,
-                                  AccDataType,
-                                  CDataType,
-                                  ALayout,
-                                  BLayout,
-                                  DsLayout,
-                                  CLayout,
-                                  CDEElementWise,
-                                  ck_tile::StreamKReductionStrategy::Atomic>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, flush_cache});
+        average_time_and_batch = gemm<GemmConfiguration,
+                                      ADataType,
+                                      BDataType,
+                                      DsDataType,
+                                      AccumulatorDataType,
+                                      CDataType,
+                                      ALayout,
+                                      BLayout,
+                                      DsLayout,
+                                      CLayout,
+                                      CDEElementWise,
+                                      ck_tile::StreamKReductionStrategy::Atomic>(
+            args,
+            ck_tile::stream_config{
+                nullptr, true, 1, warmup_iterations, repeat_iterations, true, flush_cache});
     }
     else /*Reduction*/
     {
-        ave_time_and_batch = gemm<GemmConfig,
-                                  ADataType,
-                                  BDataType,
-                                  DsDataType,
-                                  AccDataType,
-                                  CDataType,
-                                  ALayout,
-                                  BLayout,
-                                  DsLayout,
-                                  CLayout,
-                                  CDEElementWise,
-                                  ck_tile::StreamKReductionStrategy::Reduction>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, flush_cache});
+        average_time_and_batch = gemm<GemmConfiguration,
+                                      ADataType,
+                                      BDataType,
+                                      DsDataType,
+                                      AccumulatorDataType,
+                                      CDataType,
+                                      ALayout,
+                                      BLayout,
+                                      DsLayout,
+                                      CLayout,
+                                      CDEElementWise,
+                                      ck_tile::StreamKReductionStrategy::Reduction>(
+            args,
+            ck_tile::stream_config{
+                nullptr, true, 1, warmup_iterations, repeat_iterations, true, flush_cache});
     }
 
-    return ave_time_and_batch;
+    return average_time_and_batch;
 }
 
 template <typename CDataType>
-bool do_verify(const ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
-               const ck_tile::HostTensor<CDataType>& c_m_n_ref,
-               const ck_tile::tuple<double, double>& rtol_atol,
-               const char* variant)
+bool doVerify(const ck_tile::HostTensor<CDataType>& c_m_n_device_result,
+              const ck_tile::HostTensor<CDataType>& c_m_n_reference,
+              const ck_tile::tuple<double, double>& relative_absolute_tolerances,
+              const char* variant)
 {
-    bool pass = ck_tile::check_err(c_m_n_dev_result,
-                                   c_m_n_ref,
+    bool pass = ck_tile::check_err(c_m_n_device_result,
+                                   c_m_n_reference,
                                    "Error: Incorrect results!",
-                                   rtol_atol.at(ck_tile::number<0>{}),
-                                   rtol_atol.at(ck_tile::number<1>{}));
+                                   relative_absolute_tolerances.at(ck_tile::number<0>{}),
+                                   relative_absolute_tolerances.at(ck_tile::number<1>{}));
 
-    std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-              << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl;
+    std::cout << "Relative error threshold: "
+              << relative_absolute_tolerances.at(ck_tile::number<0>{})
+              << " Absolute error threshold: "
+              << relative_absolute_tolerances.at(ck_tile::number<1>{}) << std::endl;
     std::cout << "The " << variant << " verification result is:" << (pass ? "correct" : "fail")
               << std::endl;
     return pass;
 }
 
-ck_tile::StreamKReductionStrategy get_reduction_strategy_value(const std::string& strategy)
+ck_tile::StreamKReductionStrategy getReductionStrategyValue(const std::string& strategy)
 {
     if(strategy == "atomic")
     {
@@ -156,172 +165,169 @@ ck_tile::StreamKReductionStrategy get_reduction_strategy_value(const std::string
     }
 }
 
-template <typename GemmConfig,
-          typename TypeConfig,
+template <typename GemmConfiguration,
+          typename TypeConfiguration,
           typename ALayout,
           typename BLayout,
           typename CLayout>
-int run_gemm_example_with_layouts(int argc,
-                                  char* argv[],
-                                  const ALayout a_layout                  = ALayout{},
-                                  const BLayout b_layout                  = BLayout{},
-                                  [[maybe_unused]] const CLayout c_layout = CLayout{})
+int runGemmExampleWithLayouts(int argc,
+                              char* argv[],
+                              const ALayout a_layout                  = ALayout{},
+                              const BLayout b_layout                  = BLayout{},
+                              [[maybe_unused]] const CLayout c_layout = CLayout{})
 {
-    auto [result, arg_parser] = create_args(argc, argv);
+    auto [result, arg_parser] = createArgs(argc, argv);
     if(!result)
         return -1;
 
-    static_assert(!GemmConfig::Preshuffle, "Not implemented");
-    static_assert(!GemmConfig::UseStructuredSparsity, "Not implemented");
-    static_assert(!GemmConfig::PermuteA, "Not implemented");
-    static_assert(!GemmConfig::PermuteB, "Not implemented");
+    static_assert(!GemmConfiguration::PRESHUFFLE, "Not implemented");
+    static_assert(!GemmConfiguration::USE_STRUCTURED_SPARSITY, "Not implemented");
+    static_assert(!GemmConfiguration::PERMUTE_A, "Not implemented");
+    static_assert(!GemmConfiguration::PERMUTE_B, "Not implemented");
 
-    using ADataType   = typename TypeConfig::ADataType;
-    using BDataType   = typename TypeConfig::BDataType;
-    using AccDataType = typename TypeConfig::AccDataType;
-    using CDataType   = typename TypeConfig::CDataType;
+    using ADataType           = typename TypeConfiguration::ADataType;
+    using BDataType           = typename TypeConfiguration::BDataType;
+    using AccumulatorDataType = typename TypeConfiguration::AccDataType;
+    using CDataType           = typename TypeConfiguration::CDataType;
 
-    ck_tile::index_t M = arg_parser.get_int("m");
-    ck_tile::index_t N = arg_parser.get_int("n");
-    ck_tile::index_t K = arg_parser.get_int("k");
+    ck_tile::index_t m_dim = arg_parser.get_int("m");
+    ck_tile::index_t n_dim = arg_parser.get_int("n");
+    ck_tile::index_t k_dim = arg_parser.get_int("k");
 
-    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
-    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
-    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
-
-    int n_warmup                 = arg_parser.get_int("warmup");
-    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t stride_a    = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_b    = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_c    = arg_parser.get_int("stride_c");
+    int warmup_iterations        = arg_parser.get_int("warmup");
+    int repeat_iterations        = arg_parser.get_int("repeat");
     ck_tile::index_t init_method = arg_parser.get_int("init");
     bool flush_cache             = arg_parser.get_bool("flush_cache");
-
     ck_tile::StreamKReductionStrategy reduction_strategy =
-        get_reduction_strategy_value(arg_parser.get_str("reduction_strategy"));
+        getReductionStrategyValue(arg_parser.get_str("reduction_strategy"));
 
-    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
-    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
-    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
+    stride_a = ck_tile::get_default_stride(m_dim, k_dim, stride_a, is_row_major(a_layout));
+    stride_b = ck_tile::get_default_stride(k_dim, n_dim, stride_b, is_row_major(b_layout));
+    stride_c = ck_tile::get_default_stride(m_dim, n_dim, stride_c, is_row_major(CLayout{}));
 
-    ck_tile::HostTensor<ADataType> a_m_k(
-        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
-    ck_tile::HostTensor<BDataType> b_k_n(
-        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
-    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
-        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+    ck_tile::HostTensor<ADataType> a_m_k_host(
+        ck_tile::host_tensor_descriptor(m_dim, k_dim, stride_a, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n_host(
+        ck_tile::host_tensor_descriptor(k_dim, n_dim, stride_b, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_m_n_device_result(
+        ck_tile::host_tensor_descriptor(m_dim, n_dim, stride_c, is_row_major(CLayout{})));
 
     if(init_method == 0)
     {
-        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
-        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k_host);
+        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n_host);
     }
     else if(init_method == 1)
     {
-        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
-        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
+        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k_host);
+        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n_host);
     }
     else if(init_method == 2)
     {
-        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k);
-        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n);
+        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k_host);
+        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n_host);
     }
     else
     {
-        a_m_k.SetZero();
-        b_k_n.SetZero();
+        a_m_k_host.SetZero();
+        b_k_n_host.SetZero();
     }
 
-    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem a_m_k_device_memory(a_m_k_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_device_memory(b_k_n_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_device_memory(c_m_n_device_result.get_element_space_size_in_bytes());
 
-    a_m_k_dev_buf.ToDevice(a_m_k.data());
-    b_k_n_dev_buf.ToDevice(b_k_n.data());
-    c_m_n_dev_buf.SetZero();
-    c_m_n_dev_result.SetZero();
+    a_m_k_device_memory.ToDevice(a_m_k_host.data());
+    b_k_n_device_memory.ToDevice(b_k_n_host.data());
+    c_m_n_device_memory.SetZero();
+    c_m_n_device_result.SetZero();
+    auto [average_time, num_wgs_per_tile] = invokeGemm<GemmConfiguration,
+                                                       ADataType,
+                                                       BDataType,
+                                                       ck_tile::tuple<>,
+                                                       AccumulatorDataType,
+                                                       CDataType,
+                                                       ALayout,
+                                                       BLayout,
+                                                       ck_tile::tuple<>,
+                                                       CLayout>(a_m_k_device_memory,
+                                                                b_k_n_device_memory,
+                                                                c_m_n_device_memory,
+                                                                m_dim,
+                                                                n_dim,
+                                                                k_dim,
+                                                                stride_a,
+                                                                stride_b,
+                                                                stride_c,
+                                                                warmup_iterations,
+                                                                repeat_iterations,
+                                                                flush_cache,
+                                                                reduction_strategy);
 
-    auto [ave_time, num_wgs_per_tile] = invoke_gemm<GemmConfig,
-                                                    ADataType,
-                                                    BDataType,
-                                                    ck_tile::tuple<>,
-                                                    AccDataType,
-                                                    CDataType,
-                                                    ALayout,
-                                                    BLayout,
-                                                    ck_tile::tuple<>,
-                                                    CLayout>(a_m_k_dev_buf,
-                                                             b_k_n_dev_buf,
-                                                             c_m_n_dev_buf,
-                                                             M,
-                                                             N,
-                                                             K,
-                                                             stride_A,
-                                                             stride_B,
-                                                             stride_C,
-                                                             n_warmup,
-                                                             n_repeat,
-                                                             flush_cache,
-                                                             reduction_strategy);
+    c_m_n_device_memory.FromDevice(c_m_n_device_result.data());
 
-    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
-
-    std::size_t flop = std::size_t(2) * M * N * K;
-    std::size_t num_byte =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K
-              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
+    std::size_t flop     = std::size_t(2) * m_dim * n_dim * k_dim;
+    std::size_t num_byte = sizeof(ADataType) * m_dim * k_dim + sizeof(BDataType) * n_dim * k_dim +
+                           sizeof(CDataType) * m_dim * n_dim;
+    float tflops     = static_cast<float>(flop) / 1.E9 / average_time;
+    float gb_per_sec = num_byte / 1.E6 / average_time;
+    std::cout << "Run Gemm kernel with M=" << m_dim << " N=" << n_dim << " K=" << k_dim
+              << " StrideA=" << stride_a << " StrideB=" << stride_b << " StrideC=" << stride_c
               << " A_Layout=" << ALayout::name << " B_Layout=" << BLayout::name
               << " C_Layout=" << CLayout::name
               << " A_Type=" << ck_tile::DataTypeTraits<ADataType>::name
               << " B_Type=" << ck_tile::DataTypeTraits<BDataType>::name
               << " C_Type=" << ck_tile::DataTypeTraits<CDataType>::name
               << " reduction_strategy=" << arg_parser.get_str("reduction_strategy") << " "
-              << " persistent_dp=" << arg_parser.get_str("persistent_dp") << " " << ave_time
+              << " persistent_dp=" << arg_parser.get_str("persistent_dp") << " " << average_time
               << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
-
     bool pass = false;
 
     // Memory on host to store gpu reference result
-    ck_tile::HostTensor<CDataType> c_m_n_ref(
-        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-    c_m_n_ref.SetZero();
+    ck_tile::HostTensor<CDataType> c_m_n_reference(
+        ck_tile::host_tensor_descriptor(m_dim, n_dim, stride_c, is_row_major(CLayout{})));
+    c_m_n_reference.SetZero();
 
     if(arg_parser.get_int("v") == 1) // Validate on the CPU
     {
-        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
-            a_m_k, b_k_n, c_m_n_ref);
+        ck_tile::reference_gemm<ADataType, BDataType, AccumulatorDataType, CDataType>(
+            a_m_k_host, b_k_n_host, c_m_n_reference);
         const float max_accumulated_value =
-            *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, num_wgs_per_tile, max_accumulated_value);
-        pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "CPU");
+            *std::max_element(c_m_n_reference.mData.begin(), c_m_n_reference.mData.end());
+        const auto relative_absolute_tolerances =
+            calculateRtolAtol<ADataType, BDataType, AccumulatorDataType, CDataType>(
+                k_dim, num_wgs_per_tile, max_accumulated_value);
+        pass = doVerify(c_m_n_device_result, c_m_n_reference, relative_absolute_tolerances, "CPU");
     }
     else if(arg_parser.get_int("v") == 2) // Validate on the GPU
     {
         // Memory on device to store gpu reference result
-        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_ref.get_element_space_size_in_bytes());
-        c_m_n_gpu_buf_ref.SetZero();
-
-        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
-        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
-        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());
+        ck_tile::DeviceMem c_m_n_gpu_buffer_reference(
+            c_m_n_reference.get_element_space_size_in_bytes());
+        c_m_n_gpu_buffer_reference.SetZero();
 
+        ADataType* d_A = static_cast<ADataType*>(a_m_k_device_memory.GetDeviceBuffer());
+        BDataType* d_B = static_cast<BDataType*>(b_k_n_device_memory.GetDeviceBuffer());
+        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buffer_reference.GetDeviceBuffer());
         ck_tile::reference_gemm_gpu<ADataType,
                                     BDataType,
-                                    AccDataType,
+                                    AccumulatorDataType,
                                     CDataType,
                                     ALayout,
                                     BLayout,
-                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
-
-        c_m_n_gpu_buf_ref.FromDevice(c_m_n_ref.data());
+                                    CLayout>(
+            d_A, d_B, d_C, m_dim, n_dim, k_dim, stride_a, stride_b, stride_c);
+        c_m_n_gpu_buffer_reference.FromDevice(c_m_n_reference.data());
 
         const float max_accumulated_value =
-            *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, num_wgs_per_tile, max_accumulated_value);
-        pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "GPU");
+            *std::max_element(c_m_n_reference.mData.begin(), c_m_n_reference.mData.end());
+        const auto relative_absolute_tolerances =
+            calculateRtolAtol<ADataType, BDataType, AccumulatorDataType, CDataType>(
+                k_dim, num_wgs_per_tile, max_accumulated_value);
+        pass = doVerify(c_m_n_device_result, c_m_n_reference, relative_absolute_tolerances, "GPU");
     }
 
     return pass;
diff --git a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
index 83795fbf6a..d3ee9fe9c6 100644
--- a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
+++ b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
@@ -4,11 +4,11 @@
 #include "gemm_utils.hpp"
 #include "ck_tile/ops/common.hpp"
 
-template <typename GemmConfig,
+template <typename GemmConfiguration,
           typename ADataType,
           typename BDataType,
           typename DsDataType,
-          typename AccDataType,
+          typename AccumulatorDataType,
           typename CDataType,
           typename ALayout,
           typename BLayout,
@@ -17,43 +17,49 @@ template <typename GemmConfig,
           typename CDEElementWise,
           ck_tile::StreamKReductionStrategy ReductionStrategy>
 std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
-                                         const ck_tile::stream_config& s)
+                                         const ck_tile::stream_config& stream_config)
 {
-    using GemmShape = ck_tile::TileGemmShape<
-        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
-        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
-        ck_tile::
-            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
-        GemmConfig::PermuteA,
-        GemmConfig::PermuteB>;
+    using GemmShape = ck_tile::TileGemmShape<ck_tile::sequence<GemmConfiguration::M_TILE,
+                                                               GemmConfiguration::N_TILE,
+                                                               GemmConfiguration::K_TILE>,
+                                             ck_tile::sequence<GemmConfiguration::M_WARP,
+                                                               GemmConfiguration::N_WARP,
+                                                               GemmConfiguration::K_WARP>,
+                                             ck_tile::sequence<GemmConfiguration::M_WARP_TILE,
+                                                               GemmConfiguration::N_WARP_TILE,
+                                                               GemmConfiguration::K_WARP_TILE>,
+                                             GemmConfiguration::PERMUTE_A,
+                                             GemmConfiguration::PERMUTE_B>;
 
-    using TilePartitioner =
-        ck_tile::StreamKTilePartitioner<GemmShape, ReductionStrategy, GemmConfig::Persistent>;
+    using TilePartitioner = ck_tile::
+        StreamKTilePartitioner<GemmShape, ReductionStrategy, GemmConfiguration::PERSISTENT>;
 
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
-                                                                 GemmConfig::kPadN,
-                                                                 GemmConfig::kPadK,
-                                                                 GemmConfig::DoubleSmemBuffer,
-                                                                 ALayout,
-                                                                 BLayout,
-                                                                 ELayout,
-                                                                 GemmConfig::TransposeC,
-                                                                 GemmConfig::UseStructuredSparsity,
-                                                                 GemmConfig::Persistent,
-                                                                 GemmConfig::NumWaveGroups,
-                                                                 GemmConfig::Preshuffle>;
+    using GemmUniversalTraits =
+        ck_tile::TileGemmUniversalTraits<GemmConfiguration::PAD_M,
+                                         GemmConfiguration::PAD_N,
+                                         GemmConfiguration::PAD_K,
+                                         GemmConfiguration::DOUBLE_SMEM_BUFFER,
+                                         ALayout,
+                                         BLayout,
+                                         ELayout,
+                                         GemmConfiguration::TRANSPOSE_C,
+                                         GemmConfiguration::USE_STRUCTURED_SPARSITY,
+                                         GemmConfiguration::PERSISTENT,
+                                         GemmConfiguration::NUM_WAVE_GROUPS,
+                                         GemmConfiguration::PRESHUFFLE>;
 
-    const auto Run = [&](const auto memory_operation) -> std::tuple<float, ck_tile::index_t> {
+    const auto runKernel = [&](const auto memory_operation) -> std::tuple<float, ck_tile::index_t> {
         // We create the GEMM pipeline without specifying has_hot_loop or tail_num.
         // This is because num_loop can vary (a) per WG and (b) per iteration of the Stream-K
         // while loop. Instead, has_hot_loop and tail_num are determined in the Stream-K
         // Kernel's RunGemm function. This is a similar pattern used by grouped GEMM.
-        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                           BDataType,
-                                                                           AccDataType,
-                                                                           GemmShape,
-                                                                           GemmUniversalTraits,
-                                                                           GemmConfig::Scheduler>;
+        using UniversalGemmProblem =
+            ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                  BDataType,
+                                                  AccumulatorDataType,
+                                                  GemmShape,
+                                                  GemmUniversalTraits,
+                                                  GemmConfiguration::SCHEDULER>;
 
         using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<UniversalGemmProblem>;
 
@@ -61,39 +67,39 @@ std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
             ck_tile::CShuffleEpilogueProblem<ADataType,
                                              BDataType,
                                              DsDataType,
-                                             AccDataType,
+                                             AccumulatorDataType,
                                              CDataType,
                                              DsLayout,
                                              ELayout,
                                              CDEElementWise,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
-                                             GemmConfig::M_Warp,
-                                             GemmConfig::N_Warp,
-                                             GemmConfig::M_Warp_Tile,
-                                             GemmConfig::N_Warp_Tile,
-                                             GemmConfig::K_Warp_Tile,
+                                             GemmConfiguration::M_WARP,
+                                             GemmConfiguration::N_WARP,
+                                             GemmConfiguration::M_WARP_TILE,
+                                             GemmConfiguration::N_WARP_TILE,
+                                             GemmConfiguration::K_WARP_TILE,
                                              UniversalGemmProblem::TransposeC,
                                              memory_operation.value,
-                                             GemmConfig::NumWaveGroups>>;
+                                             GemmConfiguration::NUM_WAVE_GROUPS>>;
 
         using Kernel = ck_tile::StreamKKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
 
-        auto kargs                = Kernel::MakeKernelArgs(args);
-        const auto workspace_size = Kernel::GetWorkSpaceSize(kargs);
+        auto kernel_args          = Kernel::MakeKernelArgs(args);
+        const auto workspace_size = Kernel::GetWorkSpaceSize(kernel_args);
         ck_tile::DeviceMem workspace_data(workspace_size);
         workspace_data.SetZero();
-        kargs.workspace_ptr = workspace_data.GetDeviceBuffer();
+        kernel_args.workspace_ptr = workspace_data.GetDeviceBuffer();
 
-        dim3 grids  = Kernel::GridSize(kargs.tile_partitioner);
+        dim3 grids  = Kernel::GridSize(kernel_args.tile_partitioner);
         dim3 blocks = Kernel::BlockSize();
 
-        if(!Kernel::IsSupportedArgument(kargs))
+        if(!Kernel::IsSupportedArgument(kernel_args))
         {
             throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
         }
 
-        if(s.log_level_ > 0)
+        if(stream_config.log_level_ > 0)
         {
             std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
                       << "shape: " << GemmShape::GetName() << '\n'
@@ -109,7 +115,7 @@ std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
             {
                 // Clear the output C tensor results after each repetition of the kernel
                 hipGetErrorString(hipMemsetAsync(
-                    args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+                    args.e_ptr, 0, args.M * args.N * sizeof(CDataType), stream_config.stream_id_));
             }
             else if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Reduction)
             {
@@ -120,45 +126,47 @@ std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
 
         std::function<void()> preprocess = reset_data_buffers;
 
-        float ave_time = ck_tile::launch_kernel_time_mask(
-            s,
-            preprocess,
-            ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        float average_time =
+            ck_tile::launch_kernel_time_mask(stream_config,
+                                             preprocess,
+                                             ck_tile::make_kernel<GemmConfiguration::BLOCK_PER_CU>(
+                                                 Kernel{}, grids, blocks, 0, kernel_args));
 
-        ck_tile::index_t num_wgs_per_tile = kargs.tile_partitioner.estimate_num_wgs_per_tile();
-        return std::tuple{ave_time, num_wgs_per_tile};
+        ck_tile::index_t num_wgs_per_tile =
+            kernel_args.tile_partitioner.estimate_num_wgs_per_tile();
+        return std::tuple{average_time, num_wgs_per_tile};
     };
 
     if constexpr(ck_tile::StreamKReductionStrategy::Atomic == ReductionStrategy)
     {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              // Since we are doing stream K, in the case of
-                                              // atomics, multiple workgroups may write to the same
-                                              // output tile in the C tensor, so we must atomic add
-                                              // the results (not set)
-                                              ck_tile::memory_operation_enum::atomic_add>{});
+        return runKernel(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                                    // Since we are doing stream K, in the case of
+                                                    // atomics, multiple workgroups may write to the
+                                                    // same output tile in the C tensor, so we must
+                                                    // atomic add the results (not set)
+                                                    ck_tile::memory_operation_enum::atomic_add>{});
     }
     else // We are using ck_tile::StreamKReductionStrategy::Reduction
     {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              // In this case, there is only ever 1 WG writing final
-                                              // results to each macro tile in the C tensor, so we
-                                              // can do a set.
-                                              ck_tile::memory_operation_enum::set>{});
+        return runKernel(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                                    // In this case, there is only ever 1 WG writing
+                                                    // final results to each macro tile in the C
+                                                    // tensor, so we can do a set.
+                                                    ck_tile::memory_operation_enum::set>{});
     }
 }
 
 #include "run_gemm_example.inc"
 
-template <typename GemmConfig, typename TypeConfig>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+template <typename GemmConfiguration, typename TypeConfiguration>
+int runGemmExamplePrecisionType(std::string a_layout, std::string b_layout, int argc, char* argv[])
 {
     using Row = ck_tile::tensor_layout::gemm::RowMajor;
     using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
 
     if(a_layout == "R" && b_layout == "C")
     {
-        return run_gemm_example_with_layouts<GemmConfig, TypeConfig>(
+        return runGemmExampleWithLayouts<GemmConfiguration, TypeConfiguration>(
             argc, argv, Row{}, Col{}, Row{});
     }
     else
@@ -169,72 +177,74 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
     return 0;
 }
 
-template <template <typename PreType, bool Persistent_> typename GemmConfig>
-int run_gemm_example(int argc, char* argv[])
+template <template <typename PrecisionType, bool IsPersistent> typename GemmConfiguration>
+int runGemmExample(int argc, char* argv[])
 {
-    auto [result, arg_parser] = create_args(argc, argv);
+    auto [result, arg_parser] = createArgs(argc, argv);
     if(!result)
         return -1;
 
-    std::string data_type = arg_parser.get_str("prec");
-    std::string a_layout  = arg_parser.get_str("a_layout");
-    std::string b_layout  = arg_parser.get_str("b_layout");
-    auto persistent_dp    = arg_parser.get_bool("persistent_dp");
+    std::string data_type         = arg_parser.get_str("prec");
+    std::string a_layout          = arg_parser.get_str("a_layout");
+    std::string b_layout          = arg_parser.get_str("b_layout");
+    auto persistent_data_parallel = arg_parser.get_bool("persistent_dp");
 
     if(data_type == "bf16")
     {
-        using TypeConfig = StreamKGemmTypeConfig<ck_tile::bf16_t>;
-        if(persistent_dp)
+        using TypeConfiguration = StreamKGemmTypeConfiguration<ck_tile::bf16_t>;
+        if(persistent_data_parallel)
         {
-            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t, true>, TypeConfig>(
-                a_layout, b_layout, argc, argv);
+            return runGemmExamplePrecisionType<GemmConfiguration<ck_tile::bf16_t, true>,
+                                               TypeConfiguration>(a_layout, b_layout, argc, argv);
         }
         else
         {
-            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t, false>, TypeConfig>(
-                a_layout, b_layout, argc, argv);
+            return runGemmExamplePrecisionType<GemmConfiguration<ck_tile::bf16_t, false>,
+                                               TypeConfiguration>(a_layout, b_layout, argc, argv);
         }
     }
     else if(data_type == "fp16")
     {
-        using TypeConfig = StreamKGemmTypeConfig<ck_tile::half_t>;
-        if(persistent_dp)
+        using TypeConfiguration = StreamKGemmTypeConfiguration<ck_tile::half_t>;
+        if(persistent_data_parallel)
         {
-            return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t, true>, TypeConfig>(
-                a_layout, b_layout, argc, argv);
+            return runGemmExamplePrecisionType<GemmConfiguration<ck_tile::half_t, true>,
+                                               TypeConfiguration>(a_layout, b_layout, argc, argv);
         }
         else
         {
-            return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t, false>, TypeConfig>(
-                a_layout, b_layout, argc, argv);
+            return runGemmExamplePrecisionType<GemmConfiguration<ck_tile::half_t, false>,
+                                               TypeConfiguration>(a_layout, b_layout, argc, argv);
         }
     }
     else if(data_type == "fp8")
     {
-        using TypeConfig = StreamKGemmTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>;
-        if(persistent_dp)
+        using TypeConfiguration =
+            StreamKGemmTypeConfiguration<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>;
+        if(persistent_data_parallel)
         {
-            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t, true>, TypeConfig>(
-                a_layout, b_layout, argc, argv);
+            return runGemmExamplePrecisionType<GemmConfiguration<ck_tile::fp8_t, true>,
+                                               TypeConfiguration>(a_layout, b_layout, argc, argv);
         }
         else
         {
-            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t, false>, TypeConfig>(
-                a_layout, b_layout, argc, argv);
+            return runGemmExamplePrecisionType<GemmConfiguration<ck_tile::fp8_t, false>,
+                                               TypeConfiguration>(a_layout, b_layout, argc, argv);
         }
     }
     else if(data_type == "bf8")
     {
-        using TypeConfig = StreamKGemmTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>;
-        if(persistent_dp)
+        using TypeConfiguration =
+            StreamKGemmTypeConfiguration<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>;
+        if(persistent_data_parallel)
         {
-            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t, true>, TypeConfig>(
-                a_layout, b_layout, argc, argv);
+            return runGemmExamplePrecisionType<GemmConfiguration<ck_tile::bf8_t, true>,
+                                               TypeConfiguration>(a_layout, b_layout, argc, argv);
         }
         else
         {
-            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t, false>, TypeConfig>(
-                a_layout, b_layout, argc, argv);
+            return runGemmExamplePrecisionType<GemmConfiguration<ck_tile::bf8_t, false>,
+                                               TypeConfiguration>(a_layout, b_layout, argc, argv);
         }
     }
     else
@@ -247,5 +257,5 @@ int run_gemm_example(int argc, char* argv[])
 
 int main(int argc, char* argv[])
 {
-    return !run_gemm_example<GemmConfigMemoryInterwave>(argc, argv);
+    return !runGemmExample<GemmConfigurationMemoryInterwave>(argc, argv);
 }

From 6219b12730e29c357a02177dbee6e565987fcc56 Mon Sep 17 00:00:00 2001
From: Robin Voetter <robin@streamhpc.com>
Date: Sat, 13 Dec 2025 15:33:41 +0100
Subject: [PATCH 05/10] [CK_BUILDER] convolution testing (#3267)

* Add README.md for testing

* Add tensor_memory_manager.

* ck-builder: tensor memory manager rebase fixes

This fixes some issues caused by the API being changed recently.
Also, this streamlines the ckt namespace to always be ck_tile::builder::test,
as this is already being used by other tests

Really, this commit should be squashed into the previous,
but I'm keeping it separate for brevity.

* ck-builder: test arguments initial prototype

* ck-builder: test system initial prototype

* ck-builder: fix non-standardized copyright comments

* ck-builder: new prototype

* ck-builder: group testing inputs/outputs into a separate structure

This is basically the return of the tensor memory manager after all,
except that the design is more closely tied to the actual operation.
Using a struct allows us to add additional input/output tensors
without breaking code (by defaulting those new parameters). Note
that the tensors are split into a separate inputs/outputs because we
usually want to allocate the output _twice_: once for the real
computation and once for the reference computation.

* ck-builder: simplify prototype naming; start docs

* ck-builder: update testing readme

* ck-builder: testing documentation

* ck-builder: HipStatusMatcher

This matcher can be used to check HIP status codes and provide
nice and readable error messages.

* ck-builder: tensor_buffer.hpp tests

* ck-builder: conv_fwd.hpp tests

* ck-builder: add example end-to-end test in conv fwd 2d fp16

* ck-builder: simplify extent usage

* ck-builder: update testing doc

* ck-builder: skip end to end test on non-gfx9

* fix check_copyright_year interpreter

/bin/bash is not guaranteed to exist on Linux. Signed,
a NixOS user

* ck-builder: fix copyrights

* ck-builder: reduce conv fwd testing size

This test allocated 24GB of memory, too much for 16GB cards.

---------

Co-authored-by: John Shumway <jshumway@amd.com>
---
 experimental/builder/README.md                |  10 +-
 .../include/ck_tile/builder/testing/README.md | 347 ++++++++++++++++++
 .../ck_tile/builder/testing/conv_fwd.hpp      | 256 +++++++++++++
 .../ck_tile/builder/testing/conv_fwd_ck.hpp   | 102 +++++
 .../ck_tile/builder/testing/extent.hpp        |  36 ++
 .../ck_tile/builder/testing/tensor_buffer.hpp | 212 +++++++++++
 .../ck_tile/builder/testing/testing.hpp       | 260 +++++++++++++
 .../ck_tile/builder/testing/type_traits.hpp   |  43 +++
 experimental/builder/test/CMakeLists.txt      |  33 +-
 .../conv/ck/test_ckb_conv_fwd_2d_fp16.cpp     | 111 ++++--
 .../builder/test/test_testing_utils.cpp       |  11 +
 experimental/builder/test/testing_utils.cpp   |  42 +++
 experimental/builder/test/testing_utils.hpp   |  41 +++
 .../builder/test/unit_conv_fwd_testing.cpp    |  83 +++++
 .../builder/test/unit_device_buffer.cpp       |  81 ++++
 .../builder/test/unit_tensor_descriptor.cpp   |  47 +++
 script/check_copyright_year.sh                |   2 +-
 17 files changed, 1660 insertions(+), 57 deletions(-)
 create mode 100644 experimental/builder/include/ck_tile/builder/testing/README.md
 create mode 100644 experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp
 create mode 100644 experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp
 create mode 100644 experimental/builder/include/ck_tile/builder/testing/extent.hpp
 create mode 100644 experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp
 create mode 100644 experimental/builder/include/ck_tile/builder/testing/testing.hpp
 create mode 100644 experimental/builder/include/ck_tile/builder/testing/type_traits.hpp
 create mode 100644 experimental/builder/test/unit_conv_fwd_testing.cpp
 create mode 100644 experimental/builder/test/unit_device_buffer.cpp
 create mode 100644 experimental/builder/test/unit_tensor_descriptor.cpp

diff --git a/experimental/builder/README.md b/experimental/builder/README.md
index 18e9e58739..940ee3e503 100644
--- a/experimental/builder/README.md
+++ b/experimental/builder/README.md
@@ -12,19 +12,21 @@ This project is a prototype for a more general builder pattern for all of compos
 
 ## Design descriptions
 
-- [CK Builder design description](include/ck_tile/builder/README.md) 
+- [CK Builder design description](include/ck_tile/builder/README.md)
+- [CK Builder factory design](include/ck_tile/builder/factory/README.md)
+- [CK Builder testing design](include/ck_tile/builder/testing/README.md)
 
 ## Directory Structure
 
-- `include/ck_tile/builder/`  
+- `include/ck_tile/builder/`
   Core builder headers and public API.
 - `include/ck_tile/builder/reflect`
   Reflection mechanism.
 - `include/ck_tile/builder/factory`
   Compile-time dispatch from builder descriptors to our exisitng specialized convolution kernel implementations.
-- `test/`  
+- `test/`
   Unit tests and example usage of the builder pattern.
-- `CMakeLists.txt`  
+- `CMakeLists.txt`
   CMake configuration for building the experimental builder and its tests.
 
 ## CMake Configuration
diff --git a/experimental/builder/include/ck_tile/builder/testing/README.md b/experimental/builder/include/ck_tile/builder/testing/README.md
new file mode 100644
index 0000000000..85adc59d80
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/testing/README.md
@@ -0,0 +1,347 @@
+# CK-Builder Testing Utilities
+
+This directory contains testing utilities designed to simplify the process of writing unit tests for GPU kernels built with `ck_tile::builder`. These utilities enable a clean, expressive **Given-When-Then** testing pattern that separates test setup, execution, and validation.
+
+See the [main builder documentation](../README.md) for an overview of the CK-Builder API components.
+
+## Overview
+
+Testing GPU kernels typically involves significant boilerplate: allocating device memory, initializing test data, launching kernels, and validating results. The utilities in this directory abstract away these repetitive tasks, allowing you to focus on defining test cases and verifying correctness.
+
+The core components are:
+
+- **`Args`**: A struct template that holds runtime parameters for a specific test case.
+- **`Inputs`** and **`Outputs`**: Helper structures that group operation inputs and outputs.
+- **`Validator`**: A utility that performs on-GPU validation and integrates with GoogleTest/GoogleMock.
+
+Together, these components enable a structured approach to kernel testing that mirrors the Given-When-Then pattern commonly used in behavior-driven development.
+
+## The Given-When-Then Testing Pattern
+
+The Given-When-Then pattern organizes tests into three distinct phases:
+
+1. **Given**: Set up the preconditions and test data
+2. **When**: Execute the action being tested
+3. **Then**: Verify the expected outcome
+
+This structure makes tests easier to read, write, and maintain. Each phase has a clear purpose, and the testing utilities are designed to support this workflow.
+
+### Given: Defining the Test Case
+
+The "Given" phase establishes the context for your test. This includes both the compile-time characteristics of the kernel and the runtime parameters for the specific test case.
+
+#### Operation Signature
+
+The "signature" defines the **mathematical contract** that the kernel must satisfy. It specifies compile-time properties such as:
+
+- Spatial dimensionality (1D, 2D, or 3D)
+- Convolution direction (Forward, Backward Data, Backward Weight)
+- Tensor memory layout (e.g., NHWC, NCHW)
+- Data types (FP32, FP16, BF16, etc.)
+- Fused element-wise operations (e.g., Bias, ReLU)
+
+The format of the signature struct is enforced at compile time using C++20 concepts by the CK-Builder API, ensuring type safety and enabling compile-time optimizations. The design of these concepts and the required constraints are discussed in the [CK Builder design description](../README.md).
+
+```cpp
+// Define our custom signature struct.
+struct ConvSignature {
+    int spatial_dim = 2;
+    ck_tile::builder::ConvDirection direction =
+        ck_tile::builder::ConvDirection::FORWARD;
+    ck_tile::builder::GroupConvLayout2D layout =
+        ck_tile::builder::GroupConvLayout2D::NHWGC_GKYXC_NHWGK;
+    ck_tile::builder::DataType data_type =
+        ck_tile::builder::DataType::FP16;
+    ck_tile::builder::ElementwiseOperation elementwise_operation =
+        ck_tile::builder::ElementwiseOperation::NONE;
+};
+
+// Double-check that our structure is well-defined according to the CK-Builder API.
+static_assert(ck_tile::builder::ConvSignatureDescriptor<ConvSignature>);
+
+// Instantiate the signature with a configuration. These values are again checked
+// by the CK-Builder API when a device operation is built.
+constexpr auto SIGNATURE = ConvSignature{
+    .spatial_dim = 2,
+    .direction = ck_tile::builder::ConvDirection::FORWARD,
+    .layout = ck_tile::builder::GroupConvLayout2D::NHWGC_GKYXC_NHWGK,
+    .data_type = ck_tile::builder::DataType::FP16,
+    .elementwise_operation = ck_tile::builder::ElementwiseOperation::NONE,
+};
+```
+
+#### Run-time Arguments
+
+The `Args` struct template provides the **runtime parameters** for your test case. It is parameterized by the `SIGNATURE` and contains fields for tensor dimensions, strides, dilations, and other dynamic properties. Note that the exact parameters required for each `Args` depends on the `SIGNATURE`: For example, a `SIGNATURE` that represents a forward convolution requires specifying the number of batches, groups, input- and output-channels, filter dimensions, filter strides, and so on. A `SIGNATURE` that represents a simple GEMM operation may instead require only the dimensions of the A-, B- and C-matrices.
+
+```cpp
+ck_tile::builder::test::Args<SIGNATURE> args = {
+    .lengths = {
+        .batch_size      = 128,
+        .groups          = 1,
+        .input_channels  = 64,
+        .output_channels = 128,
+        .image           = {.height = 56, .width = 56},
+        .filter          = {.height = 3,  .width = 3},
+    },
+    .filter_strides  = {.height = 1, .width  = 1},
+    .filter_dilation = {.height = 1, .width  = 1},
+    .input_left_pad  = {.width  = 1, .height = 1},
+    .input_right_pad = {.width  = 1, .height = 1},
+};
+```
+
+#### Tensor Memory Management
+
+Tensor memory is passed using the `Inputs<SIGNATURE>` and `Outputs<SIGNATURE>` structures. These group all inputs and outputs for an operation. Note that these structures do not "own" the memory inside: They only logically group the inputs so that they can be passed as a common type. The amount of inputs and outputs may differ depending on the `SIGNATURE`, and this avoids having to pass additional values and accept additional parameters in those situations.
+
+The exact fields in `Inputs` and `Outputs` depend again on the particular `SIGNATURE` that they are constructed with. In general, these structures are intended to be freely constructible from external data and only serve to group relevant information. Automatic memory management can be performed using the `UniqueInputs<SIGNATURE>` and `UniqueOutputs<SIGNATURE>` structures instead. The `alloc_inputs` and `alloc_outputs` functions are used to initialize these types: They take an `Args` structure and allocate the appropriate amounts of memory. `.get()` is used to return an instance of the appropriate `Input` or `Output`.
+
+```cpp
+auto inputs = ck_tile::builder::test::allocate_inputs(args);
+auto outputs = ck_tile::builder::test::allocate_outputs(args);
+```
+
+Note that these functions merely _allocate_ memory: After allocation, the memory is still uninitialized.
+
+#### Tensor Memory Initialization
+
+Operation inputs can be initialized by using `ck_tile::builder::test::init_inputs()`. Crucially, this operation accepts _all_ inputs, as well as the `args` structure. This is because initializing tensor memory is a context-dependent operation: We need to understand the operation in detail in order to generate inputs which do not overflow, do not generate NaNs or all zeros, etc. Passing the `args` allows `init_inputs` to generate a good test for the operation at hand.
+
+### When: Executing the Kernel
+
+The "When" phase is where the kernel to be tested is actually executed. This involves selecting an algorithm and using the `Builder` to generate the kernel.
+
+#### Operation Algorithm
+
+The "algorithm" defines the **implementation strategy** for the kernel. It specifies low-level details such as:
+
+- Thread block dimensions and tile sizes
+- GEMM implementation (XDL or WMMA)
+- Data transfer vectorization
+- Pipeline scheduling
+
+As with the signature struct, the format of the algorithm struct is enforced at compile time using C++20 concepts by the CK-Builder API. The design of these concepts and the required constraints are discussed in the [CK Builder factory design description](../factory/README.md).
+
+
+```cpp
+// Define our custom algorithm struct.
+struct ConvAlgorithm {
+    // Thread block configuration
+    ThreadBlock thread_block;
+
+    // Gridwise GEMM configuration
+    GridwiseXdlGemm gridwise_gemm;
+
+    // Block transfer configuration
+    Transfer transfer;
+
+    // Additional tuning parameters
+    // ...
+};
+
+// Double-check that our algorithm is well-defined according to the CK-Builder API.
+static_assert(ck_tile::builder::ConvAlgorithmDescriptor<ConvAlgorithm>);
+
+// Instantiate the algorithm with a configuration. Like with the signature struct
+// the CK-Builder API will check that the values are correct when a device
+// operation is built.
+constexpr auto ALGORITHM = ConvAlgorithm{
+    .thread_block = /* ... */,
+    .gridwise_gemm = /* ... */,
+    .transfer = /* ... */,
+    // ...
+};
+```
+
+#### Building the Kernel
+
+The `Builder` combines the signature (what to compute) with the algorithm (how to compute it) to generate a kernel type which represents the operation. The implementation details, including invocation method, depend on the particular signature and algorithm.
+
+```cpp
+using Conv = ck_tile::builder::ConvBuilder<SIGNATURE, ALGORITHM>::Instance;
+auto conv = Conv{};
+```
+
+#### Invoking the Kernel
+
+After creating the kernel instance, it can be invoked by passing the instance, the arguments, the inputs, and the outputs to `run()`. This operation writes results into the buffers in `outputs`.
+
+```cpp
+ck_tile::builder::test::run(conv, args, inputs.get(), outputs.get());
+```
+
+### Then: Verifying the Results
+
+The "Then" phase validates that the kernel produced the expected output. This is done by running a reference kernel and comparing the results.
+
+#### Building the Reference Kernel
+
+The reference kernel is just another kernel instance of the builder, one that's been externally verified to produce the correct results. As this kernel is also running on the GPU, we can use it to perform tests far more quickly than when comparing the outputs to a CPU-based reference implementation.
+
+In order to obtain an instance of the reference kernel, the correct `ALGORITHM` needs to be passed to the `Builder`.
+
+```cpp
+struct ReferenceAlgorithm {
+    ck_tile::builder::ConvAlgorithmSpecialization specialization;
+};
+static_assert(ck_tile::builder::ConvAlgorithmDescriptor<ReferenceAlgorithm>);
+constexpr auto REFERENCE_ALGORITHM = ReferenceAlgorithm{
+    .specialization = ck_tile::builder::ConvAlgorithmSpecialization::REFERENCE,
+};
+using ReferenceConv = ck_tile::builder::ConvBuilder<SIGNATURE, REFERENCE_ALGORITHM>::Instance;
+auto reference_conv = ReferenceConv{};
+```
+
+This instance can then be invoked using `ck_tile::builder::test::run()`, the same as the kernel to be tested. Note that another instance of the `Outputs` structure needs to be passed here in order to store the results.
+
+```cpp
+auto reference_outputs = ck_tile::builder::test::allocate_outputs(args);
+ck_tile::builder::test::run(reference_conv, args, inputs.get(), reference_outputs.get());
+```
+
+#### `Validator<SIGNATURE>`
+
+The `Validator` class encapsulates the validation logic. It performs on-GPU correctness checks by comparing two instances of the `Outputs` structure.
+
+```cpp
+ck_tile::builder::test::Validator<SIGNATURE> validator(outputs.get(), reference_outputs.get());
+```
+
+The `Validator` provides methods that return GoogleMock matchers, enabling clean integration with GoogleTest:
+
+```cpp
+EXPECT_THAT(validator.result(), validator.matches_reference_output());
+```
+
+The `matches_reference_output()` matcher checks that the output is numerically correct within acceptable tolerances. The `Validator` can also provide more detailed diagnostics, such as:
+
+- Maximum absolute error
+- Maximum relative error
+- Number of mismatched elements
+- Specific locations of errors
+
+## Complete Example
+
+Here's a complete test that demonstrates the Given-When-Then pattern:
+
+```cpp
+#include <gtest/gtest.h>
+#include "ck_tile/builder/conv_signature_concepts.hpp"
+#include "ck_tile/builder/conv_algorithm_concepts.hpp"
+#include "ck_tile/builder/conv_builder.hpp"
+#include "ck_tile/testing/tensor_memory_manager.hpp"
+#include "ck_tile/testing/validator.hpp"
+
+// Define the convolution signature
+struct ConvSignature {
+    int spatial_dim = 2;
+    ck_tile::builder::ConvDirection direction =
+        ck_tile::builder::ConvDirection::FORWARD;
+    ck_tile::builder::GroupConvLayout2D layout =
+        ck_tile::builder::GroupConvLayout2D::NHWGC_GKYXC_NHWGK;
+    ck_tile::builder::DataType data_type =
+        ck_tile::builder::DataType::FP16;
+    ck_tile::builder::ElementwiseOperation elementwise_operation =
+        ck_tile::builder::ElementwiseOperation::NONE;
+};
+static_assert(ck_tile::builder::ConvSignatureDescriptor<ConvSignature>);
+constexpr auto SIGNATURE = ConvSignature{
+    .spatial_dim = 2,
+    .direction = ck_tile::builder::ConvDirection::FORWARD,
+    .layout = ck_tile::builder::GroupConvLayout2D::NHWGC_GKYXC_NHWGK,
+    .data_type = ck_tile::builder::DataType::FP16,
+    .elementwise_operation = ck_tile::builder::ElementwiseOperation::NONE,
+};
+
+// Define the convolution algorithm
+struct ConvAlgorithm {
+    // Algorithm configuration details...
+    // (Omitted for brevity)
+};
+static_assert(ck_tile::builder::ConvAlgorithmDescriptor<ConvAlgorithm>);
+constexpr auto ALGORITHM = ConvAlgorithm{/* ... */};
+
+// Define the reference convolution algorithm
+struct ReferenceAlgorithm {
+    ck_tile::builder::ConvAlgorithmSpecialization specialization;
+};
+static_assert(ck_tile::builder::ConvAlgorithmDescriptor<ReferenceAlgorithm>);
+constexpr auto REFERENCE_ALGORITHM = ReferenceAlgorithm{
+    .specialization = ck_tile::builder::ConvAlgorithmSpecialization::REFERENCE,
+};
+
+// The actual test
+TEST(ConvolutionTest, Forward2D_FP16) {
+    // ===== GIVEN: Set up the test case =====
+
+    // Define runtime parameters
+    ck_tile::builder::test::Args<SIGNATURE> args = {
+        .lengths = {
+            .batch_size      = 128,
+            .groups          = 1,
+            .input_channels  = 64,
+            .output_channels = 128,
+            .image           = {.height = 56, .width = 56},
+            .filter          = {.height = 3,  .width = 3},
+        },
+        .filter_strides  = {.height = 1, .width  = 1},
+        .filter_dilation = {.height = 1, .width  = 1},
+        .input_left_pad  = {.width  = 1, .height = 1},
+        .input_right_pad = {.width  = 1, .height = 1},
+    };
+
+    // Allocate GPU memory
+    auto inputs = ck_tile::builder::test::allocate_inputs(args);
+    auto outputs = ck_tile::builder::test::allocate_outputs(args);
+    auto reference_outputs = ck_tile::builder::test::allocate_outputs(args);
+
+    // Initialize inputs
+    ck_tile::builder::test::init_inputs(args, inputs);
+
+    // ===== WHEN: Execute the kernel =====
+
+    // Build the kernel
+    using Conv = ck_tile::builder::ConvBuilder<SIGNATURE, ALGORITHM>::Instance;
+    auto conv = Conv{};
+
+    // Compute actual results
+    ck_tile::builder::test::run(conv, args, inputs.get(), outputs.get());
+
+    // ===== THEN: Verify the results =====
+
+    // Build the reference kernel
+    using ReferenceConv = ck_tile::builder::ConvBuilder<SIGNATURE, REFERENCE_ALGORITHM>::Instance;
+    auto reference_conv = ReferenceConv{};
+
+    // Compute reference results
+    ck_tile::builder::test::run(reference_conv, args, inputs.get(), reference_outputs.get());
+
+    // Check the results
+    ck_tile::builder::test::Validator<SIGNATURE> validator(outputs.get(), reference_outputs.get());
+    EXPECT_THAT(validator.result(), validator.is_ok());
+}
+```
+
+## Benefits of This Approach
+
+1. **Clarity**: The Given-When-Then structure makes tests self-documenting. Each phase has a clear purpose.
+
+2. **Reduced Boilerplate**: The utilities handle memory management, initialization, and validation, eliminating repetitive code.
+
+3. **Type Safety**: The use of C++20 concepts ensures that signatures and algorithms are well-formed at compile time.
+
+4. **Flexibility**: The `Args` struct can be easily extended to support different test scenarios, `Inputs` and `Outputs` can be modified to support additional tensors where necessary, and alternatives to `init_inputs()` can be provided to support additional testing strategies.
+
+5. **Integration**: The `Validator` integrates seamlessly with GoogleTest/GoogleMock, providing familiar assertion syntax.
+
+6. **Maintainability**: Changes to the testing infrastructure are localized to the utility classes, not scattered across individual tests.
+
+## Future Enhancements
+
+Potential improvements to the testing utilities include:
+
+- Performance benchmarking utilities
+- Automatic test case generation from parameter ranges
+- Enhanced error reporting with visual diffs
+- Support for multi-GPU testing scenarios
diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp
new file mode 100644
index 0000000000..f329a8a4d3
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp
@@ -0,0 +1,256 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/builder/conv_signature_concepts.hpp"
+#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp"
+#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
+#include "ck_tile/builder/testing/testing.hpp"
+#include "ck_tile/builder/testing/extent.hpp"
+#include "ck_tile/builder/testing/tensor_buffer.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+/// This file implements common functionality for invoking/testing grouped
+/// forward convolutions created through the CK Builder API. The main item
+/// in it is the `Args` specialization - which contains a complete description
+/// of a convolution operation.
+///
+/// It is not intended that this file contains implementation details for
+/// actually launching a convolution operation. As this can be done
+/// through different APIs depending on the kernel (CK, CK Tile, or a
+/// reference implementation), the code dealing with that is split out
+/// into a separate header for each implementation.
+
+namespace ck_tile::builder::test {
+
+/// @brief Convolution tensor dimensions.
+///
+/// This structure is used to describe lengths of a convolution problem. In
+/// fact, this structure is a complete description of ALL input and output
+/// lengths of a convolution problem, as this structure contains all of the
+/// combined parameters. Note that we can't also use this structure to describe
+/// tensor strides: whereas the lengths are all governed by a common set of
+/// parameters, strides of the input, weight, and output tensor are all
+/// independent.
+template <int SPATIAL_DIM>
+struct ConvTensorLengths
+{
+    size_t batch_size          = 1;  // N
+    size_t groups              = 1;  // G
+    size_t input_channels      = 1;  // C
+    size_t output_channels     = 1;  // K
+    Extent<SPATIAL_DIM> image  = {}; // W, H, D
+    Extent<SPATIAL_DIM> filter = {}; // X, Y, Z
+};
+
+/// @brief `Args` specialization for forward convolution.
+///
+/// @tparam SIGNATURE Forward convolution signature.
+///
+/// @see Args
+template <auto SIGNATURE>
+    requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
+struct Args<SIGNATURE>
+{
+    constexpr static auto SPATIAL_DIM = SIGNATURE.spatial_dim;
+    constexpr static auto INPUT_TYPE  = SIGNATURE.data_type;
+    constexpr static auto WEIGHT_TYPE = SIGNATURE.data_type;
+    constexpr static auto OUTPUT_TYPE = SIGNATURE.data_type;
+
+    // TODO: We shouldn't need to call into an internal namespace here.
+    using Ops = factory::internal::ElementwiseOps<SIGNATURE>;
+
+    // TODO: We shouldn't need to call into an internal namespace here.
+    using Layouts =
+        factory::internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM, ConvDirection::FORWARD>;
+
+    ConvTensorLengths<SPATIAL_DIM> lengths;
+
+    // TODO: Tensor strides. This needs a new structure as well as some
+    // reworking of the make_*_descriptor() functions, as the current
+    // implementation (based on ConvParam in old CK / CK Tile) does not
+    // support strides at all.
+
+    Extent<SPATIAL_DIM> filter_strides;
+    Extent<SPATIAL_DIM> filter_dilation;
+    Extent<SPATIAL_DIM> input_left_pad;
+    Extent<SPATIAL_DIM> input_right_pad;
+
+    Ops::AElementwiseOp a_elementwise_op;
+    Ops::BElementwiseOp b_elementwise_op;
+    Ops::CDEElementwiseOp cde_elementwise_op;
+
+    /// This function returns the `TensorDescriptor` corresponding to
+    /// the input-tensor of the convolution problem. This can then
+    /// be used to, for example, allocate memory.
+    TensorDescriptor<INPUT_TYPE> make_input_descriptor() const
+    {
+        // TODO: We're using old CK functionality to compute the right
+        // values here, mainly because CK tile does not support the
+        // right tensor layouts here. We should probably change that
+        // because CK currently prints an annoying message about it,
+        // plus that would let us get rid of the `to_ck_conv_param()`
+        // function.
+        const auto param = to_ck_conv_param();
+        const auto desc  = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<
+             typename Layouts::ALayout>(param);
+        return TensorDescriptor<INPUT_TYPE>(desc.GetLengths(), desc.GetStrides());
+    }
+
+    /// This function returns the `TensorDescriptor` corresponding to
+    /// the weight-tensor of the convolution problem. This can then
+    /// be used to, for example, allocate memory.
+    TensorDescriptor<WEIGHT_TYPE> make_weight_descriptor() const
+    {
+        // See note in implementation of `make_input_descriptor`.
+        const auto param = to_ck_conv_param();
+        const auto desc  = ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<
+             typename Layouts::BLayout>(param);
+        return TensorDescriptor<WEIGHT_TYPE>(desc.GetLengths(), desc.GetStrides());
+    }
+
+    /// This function returns the `TensorDescriptor` corresponding to
+    /// the output-tensor of the convolution problem. This can then
+    /// be used to, for example, allocate memory.
+    TensorDescriptor<OUTPUT_TYPE> make_output_descriptor() const
+    {
+        // See note in implementation of `make_input_descriptor`.
+        const auto param = to_ck_conv_param();
+        const auto desc  = ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<
+             typename Layouts::ELayout>(param);
+        return TensorDescriptor<OUTPUT_TYPE>(desc.GetLengths(), desc.GetStrides());
+    }
+
+    /// Convert the Args structure into a CK conv_param structure. This
+    /// function is mainly used to be able to use the existing
+    /// CK-functionality to obtain tensor descriptors.
+    ck::utils::conv::ConvParam to_ck_conv_param() const
+    {
+        const auto to_vector = [](const auto& extent) {
+            if constexpr(SPATIAL_DIM == 1)
+                return std::vector<ck::index_t>{ck::index_t(extent.width)};
+            else if constexpr(SPATIAL_DIM == 2)
+                return std::vector<ck::index_t>{ck::index_t(extent.height),
+                                                ck::index_t(extent.width)};
+            else
+                return std::vector<ck::index_t>{ck::index_t(extent.depth),
+                                                ck::index_t(extent.height),
+                                                ck::index_t(extent.width)};
+        };
+
+        return ck::utils::conv::ConvParam(SPATIAL_DIM,
+                                          this->lengths.groups,
+                                          this->lengths.batch_size,
+                                          this->lengths.output_channels,
+                                          this->lengths.input_channels,
+                                          to_vector(this->lengths.filter),
+                                          to_vector(this->lengths.image),
+                                          to_vector(this->filter_strides),
+                                          to_vector(this->filter_dilation),
+                                          to_vector(this->input_left_pad),
+                                          to_vector(this->input_right_pad));
+    }
+};
+
+/// @brief `Inputs` specialization for forward convolution.
+///
+/// @tparam SIGNATURE Forward convolution signature.
+///
+/// @see Inputs
+template <auto SIGNATURE>
+    requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
+struct Inputs<SIGNATURE>
+{
+    void* input;
+    void* weight;
+};
+
+/// @brief `Outputs` specialization for forward convolution.
+///
+/// @tparam SIGNATURE Forward convolution signature.
+///
+/// @see Outputs
+template <auto SIGNATURE>
+    requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
+struct Outputs<SIGNATURE>
+{
+    void* output;
+};
+
+/// @brief `UniqueInputs` specialization for forward convolution.
+///
+/// @tparam SIGNATURE Forward convolution signature.
+///
+/// @see UniqueInputs
+/// @see ValidUniqueInputs
+template <auto SIGNATURE>
+    requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
+struct UniqueInputs<SIGNATURE>
+{
+    DeviceBuffer input_buf;
+    DeviceBuffer weight_buf;
+
+    /// @see ValidUniqueInputs
+    Inputs<SIGNATURE> get()
+    {
+        return {
+            .input  = input_buf.get(),
+            .weight = weight_buf.get(),
+        };
+    }
+};
+
+/// @brief `UniqueOutputs` specialization for forward convolution.
+///
+/// @tparam SIGNATURE Forward convolution signature.
+///
+/// @see UniqueOutputs
+/// @see ValidUniqueOutputs
+template <auto SIGNATURE>
+    requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
+struct UniqueOutputs<SIGNATURE>
+{
+    DeviceBuffer output_buf;
+
+    /// @see ValidUniqueOutputs
+    Outputs<SIGNATURE> get()
+    {
+        return {
+            .output = output_buf.get(),
+        };
+    }
+};
+
+/// @brief `alloc_inputs()` specialization for forward convolution.
+///
+/// @tparam SIGNATURE Forward convolution signature.
+///
+/// @see alloc_inputs()
+template <auto SIGNATURE>
+    requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE> &&
+             ValidUniqueInputs<SIGNATURE>
+UniqueInputs<SIGNATURE> alloc_inputs(const Args<SIGNATURE>& args)
+{
+    return {
+        .input_buf  = alloc_tensor_buffer(args.make_input_descriptor()),
+        .weight_buf = alloc_tensor_buffer(args.make_weight_descriptor()),
+    };
+}
+
+/// @brief `alloc_outputs()` specialization for forward convolution.
+///
+/// @tparam SIGNATURE Forward convolution signature.
+///
+/// @see alloc_outputs()
+template <auto SIGNATURE>
+    requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE> &&
+             ValidUniqueOutputs<SIGNATURE>
+UniqueOutputs<SIGNATURE> alloc_outputs(const Args<SIGNATURE>& args)
+{
+    return {
+        .output_buf = alloc_tensor_buffer(args.make_output_descriptor()),
+    };
+}
+
+} // namespace ck_tile::builder::test
diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp
new file mode 100644
index 0000000000..cc5c613d95
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp
@@ -0,0 +1,102 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <span>
+#include <cstddef>
+
+#include "ck_tile/builder/testing/conv_fwd.hpp"
+
+/// This file contains the implementation details for invoking/testing
+/// grouped convolution operations in old CK. The main item is the
+/// `run()` function, which is the main implementation used to invoke
+/// CK grouped forward convolution kernels.
+
+namespace ck_tile::builder::test {
+
+/// @brief Concept for checking whether a convolution is invoked like old CK.
+///
+/// This concept is used to tell whether a convolution implementation is
+/// likely to be an "old CK" implementation - that is, whether we should
+/// invoke it as an old CK kernel. This is mainly used with `run()` to
+/// select which implementation should be invoked.
+///
+/// - SIGNATURE is the operation signature.
+/// - Conv is a convolution instance created by the CK Builder API.
+template <auto SIGNATURE, typename Conv>
+concept IsCkConvInstance =
+    // TODO: This should be implemented by converting the signature into the
+    // type parameters for DeviceGroupedConvFwdMultipleABD. For now, just leave
+    // it empty. Improve when needed, you get the point. Also we should probably
+    // move this to the ck conv factory helper.
+    true;
+
+/// @brief `run()` specialization for forward convolution and old CK.
+///
+/// @tparam SIGNATURE Forward convolution signature.
+/// @throws std::runtime_error if the arguments weren't actually valid for the
+/// operation. This should be caught and reported by the testing framework.
+///
+/// @see run()
+template <auto SIGNATURE, typename Conv>
+    requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE> &&
+             IsCkConvInstance<SIGNATURE, Conv>
+void run(Conv& conv,
+         const Args<SIGNATURE>& args,
+         const Inputs<SIGNATURE>& inputs,
+         const Outputs<SIGNATURE>& outputs)
+{
+    constexpr auto spatial_dim = SIGNATURE.spatial_dim;
+
+    const auto copy = [](const auto& src, auto& dst) {
+        std::copy(src.begin(), src.end(), dst.begin());
+    };
+
+    const auto to_ck_lengths = [&](const auto& src) {
+        std::array<ck::index_t, spatial_dim + 3> result;
+        copy(src, result);
+        return result;
+    };
+
+    const auto to_ck_extent = [&](const auto& extent) {
+        std::array<ck::index_t, spatial_dim> result;
+        copy(extent, result);
+        return result;
+    };
+
+    const auto param = args.to_ck_conv_param();
+
+    const auto input_desc  = args.make_input_descriptor();
+    const auto weight_desc = args.make_weight_descriptor();
+    const auto output_desc = args.make_output_descriptor();
+
+    auto ck_args = conv.MakeArgument(inputs.input,
+                                     inputs.weight,
+                                     {},
+                                     outputs.output,
+                                     to_ck_lengths(input_desc.get_lengths()),
+                                     to_ck_lengths(input_desc.get_strides()),
+                                     to_ck_lengths(weight_desc.get_lengths()),
+                                     to_ck_lengths(weight_desc.get_strides()),
+                                     {},
+                                     {},
+                                     to_ck_lengths(output_desc.get_lengths()),
+                                     to_ck_lengths(output_desc.get_strides()),
+                                     to_ck_extent(param.conv_filter_strides_),
+                                     to_ck_extent(param.conv_filter_dilations_),
+                                     to_ck_extent(param.input_left_pads_),
+                                     to_ck_extent(param.input_right_pads_),
+                                     args.a_elementwise_op,
+                                     args.b_elementwise_op,
+                                     args.cde_elementwise_op);
+
+    if(!conv.IsSupportedArgument(ck_args))
+    {
+        throw std::runtime_error("invalid argument");
+    }
+
+    conv.MakeInvoker().Run(ck_args, {});
+}
+
+} // namespace ck_tile::builder::test
diff --git a/experimental/builder/include/ck_tile/builder/testing/extent.hpp b/experimental/builder/include/ck_tile/builder/testing/extent.hpp
new file mode 100644
index 0000000000..a2d9b3ff4c
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/testing/extent.hpp
@@ -0,0 +1,36 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+namespace ck_tile::builder::test {
+
+/// This structure describes a 1-, 2-, or 3-D extent. It is used to
+/// communicate 1-, 2- or 3-D sizes and strides of tensors.
+/// Depending on the dimension, the structure will have the `width`,
+/// `height`, and `depth` fields available.
+template <int SPATIAL_DIM>
+struct Extent;
+
+template <>
+struct Extent<1>
+{
+    size_t width = 1;
+};
+
+template <>
+struct Extent<2>
+{
+    size_t width  = 1;
+    size_t height = 1;
+};
+
+template <>
+struct Extent<3>
+{
+    size_t width  = 1;
+    size_t height = 1;
+    size_t depth  = 1;
+};
+
+} // namespace ck_tile::builder::test
diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp
new file mode 100644
index 0000000000..42f85f8017
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp
@@ -0,0 +1,212 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <stdexcept>
+#include <memory>
+#include <numeric>
+#include <span>
+#include <concepts>
+#include <hip/hip_runtime.h>
+#include "ck_tile/builder/conv_signature_concepts.hpp"
+#include "ck_tile/builder/testing/type_traits.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+
+/// This file deals with tensor memory allocation: Both the act of allocating
+/// and (automatically) deallocating memory, as well as utilities for managing
+/// the layout of tensor data in memory.
+
+namespace ck_tile::builder::test {
+
+/// @brief Automatic deleter for GPU memory.
+///
+/// This structure implements a C++ functor which can be used to configure
+/// `std::unique_ptr` to automatically delete memory using `hipFree`.
+///
+/// @see DeviceBuffer
+struct DeviceMemoryDeleter
+{
+    /// @brief Deleter callback.
+    ///
+    /// This function is invoked by `std::unique_ptr` when memory that the
+    /// pointer represents should be freed. In our implementation, we just
+    /// pass it directly to `hipFree`.
+    void operator()(std::byte* ptr) const
+    {
+        if(ptr)
+            (void)hipFree(ptr);
+    }
+};
+
+/// @brief HIP out of memory error
+///
+/// This is a derivation of `std::runtime_error` specialized for HIP
+/// out-of-memory errors.
+///
+/// @see std::runtime_error
+struct OutOfDeviceMemoryError : std::runtime_error
+{
+    /// @brief Utility for formatting out-of-memory error messages
+    ///
+    /// Returns a human-readable description of a HIP out-of-memory error.
+    ///
+    /// @param status The status to report
+    static std::string format_error(hipError_t status)
+    {
+        return std::string("failed to allocate hip memory: ") + hipGetErrorString(status) + " (" +
+               std::to_string(status) + ")";
+    }
+
+    /// @brief Construct an out-of-memory error using `status` as message.
+    ///
+    /// @param status A HIP error status that was encountered while allocating memory.
+    explicit OutOfDeviceMemoryError(hipError_t status) : std::runtime_error(format_error(status)) {}
+};
+
+/// @brief Automatically managed GPU memory.
+///
+/// The `DeviceBuffer` is an automatically managed pointer for GPU memory. When
+/// adopting a device pointer into a `DeviceBuffer`, it will automatically be
+/// free'd when the pointer goes out of scope. Memory can be allocated directly
+/// into a `DeviceBuffer` using `alloc_buffer()` or `alloc_tensor_buffer()`.
+///
+/// Since this type is just an alias of `std::unique_ptr`, you can use that type's
+/// functionality to manage memory further, such as `.reset()` to release the
+/// memory.
+///
+/// @see alloc_buffer()
+/// @see alloc_tensor_buffer()
+using DeviceBuffer = std::unique_ptr<std::byte[], DeviceMemoryDeleter>;
+
+/// @brief Allocate automatically managed GPU memory.
+///
+/// This function essentially acts like a managed version of hipMalloc -
+/// allocating GPU memory on the currently active device - except that this
+/// version returns an automatically managed pointer.
+///
+/// @param size The amount of memory to allocate in bytes.
+/// @throws OutOfDeviceMemoryError if memory allocation failed.
+///
+/// @see DeviceBuffer
+/// @see OutOfDeviceMemoryError
+/// @see hipMalloc()
+inline DeviceBuffer alloc_buffer(size_t size)
+{
+    std::byte* d_buf = nullptr;
+    if(const auto status = hipMalloc(&d_buf, size); status != hipSuccess)
+    {
+        throw OutOfDeviceMemoryError(status);
+    }
+    return DeviceBuffer(d_buf);
+}
+
+/// @brief Type managing tensor data layout in memory.
+///
+/// This structure describes a tensor in memory. It does not actually hold any
+/// reference to memory; it only describes how that memory would be laid out if
+/// it did.
+///
+/// @note This type is very much like ck_tile::HostTensorDescriptor, except that it
+/// also includes the data type of the elements of this tensor. This is mainly to
+/// make the descriptor a _complete_ description of a tensor rather than just the
+/// dimensions and strides, which helps in reducing clutter in uses of this type.
+///
+/// @note All strides are still in _elements_.
+///
+/// @tparam DT The conceptual data type of the tensor elements. This need not be the
+///   type that the data is actually stored as in memory.
+template <DataType DT>
+struct TensorDescriptor
+{
+    // For now, the implementation of this type is based on
+    // `ck_tile::HostTensorDescriptor`, so that we can prototype without
+    // reimplementing the `HostTensorDescriptor` for the 3rd time. You can regard
+    // the use of `ck_tile::HostTensorDescriptor` here as an implementation detail.
+
+    /// The conceptual data type of the tensor elements. This need not be the type
+    /// that the data is actually stored as in memory.
+    constexpr static DataType data_type = DT;
+
+    /// @brief Create a tensor descriptor from lengths and strides.
+    ///
+    /// @param lengths A sequence of tensor lengths, the conceptual dimensions of
+    ///   the tensor in elements.
+    /// @param strides A sequence of in-memory strides of the tensor, measured in
+    ///   elements. Each element of `strides` corresponds to the one at the same
+    ///   index in `lengths`: the number of elements to skip in memory to find the
+    ///   next element along that axis.
+    TensorDescriptor(std::span<const size_t> lengths, std::span<const size_t> strides)
+        : inner_descriptor_(lengths, strides)
+    {
+        // TODO: Validation of strides? For now we just delegate the details of the
+        // construction to the CK Tile HostTensorDescriptor.
+    }
+
+    /// Query the conceptual dimensions of the tensor.
+    ///
+    /// @returns A span of tensor dimensions, one for every axis. Note that the order
+    ///   does *not* correspond with memory layout; query the in-memory strides for
+    ///   that.
+    ///
+    /// @see get_strides()
+    std::span<const size_t> get_lengths() const { return inner_descriptor_.get_lengths(); }
+
+    /// Query the in-memory strides of the tensor.
+    ///
+    /// @returns A span of tensor strides, one for every axis. Each entry is the
+    ///   stride, in elements, of the axis at the same index in the tensor
+    ///   dimensions.
+    ///
+    /// @see get_lengths()
+    std::span<const size_t> get_strides() const { return inner_descriptor_.get_strides(); }
+
+    /// @brief Compute total tensor size in elements.
+    ///
+    /// This function returns the total size of the memory backing a tensor with
+    /// this descriptor in *elements*, including required extra size for strides.
+    ///
+    /// @see get_element_space_size_in_bytes()
+    size_t get_element_space_size() const { return inner_descriptor_.get_element_space_size(); }
+
+    /// @brief Compute total tensor size in bytes.
+    ///
+    /// This function is like `get_element_space_size()`, except that the returned
+    /// value is measured in *bytes* rather than *elements*. Use this function for
+    /// figuring out how much memory needs to be allocated for a particular tensor.
+    ///
+    /// @see get_element_space_size()
+    size_t get_element_space_size_in_bytes() const
+    {
+        // For now, the backing type is the naive C++-type that represents the data
+        // type. When we are going to support packed types such as i4 and fp6, this
+        // is going to become more complicated.
+        return get_element_space_size() * data_type_sizeof(DT);
+    }
+
+    private:
+    ck_tile::HostTensorDescriptor inner_descriptor_;
+};
+
+/// @brief Allocate automatically managed GPU memory corresponding to a tensor descriptor.
+///
+/// This function is similar to `alloc_buffer()`, except that the required size is
+/// derived automatically from a tensor descriptor. The returned buffer is valid for
+/// tensors with that layout. Strides are also taken into account when computing the
+/// required size.
+///
+/// @tparam DT The conceptual datatype of the elements of the tensor.
+/// @param descriptor A descriptor of the memory layout of the tensor to allocate.
+/// @throws OutOfDeviceMemoryError if memory allocation failed.
+///
+/// @see TensorDescriptor
+/// @see DeviceBuffer
+/// @see OutOfDeviceMemoryError
+/// @see hipMalloc()
+template <DataType DT>
+DeviceBuffer alloc_tensor_buffer(const TensorDescriptor<DT>& descriptor)
+{
+    return alloc_buffer(descriptor.get_element_space_size_in_bytes());
+}
+
+} // namespace ck_tile::builder::test
diff --git a/experimental/builder/include/ck_tile/builder/testing/testing.hpp b/experimental/builder/include/ck_tile/builder/testing/testing.hpp
new file mode 100644
index 0000000000..1873af2882
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/testing/testing.hpp
@@ -0,0 +1,260 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <concepts>
+
+/// This file is the main header for the CK-Builder testing system. A high-level
+/// description of this testing system is documented in
+/// `ck_tile/builder/testing/README.md`. This file mainly deals with the
+/// documentation of the implementation details by forward-declaring and documenting
+/// the relevant types.
+///
+/// The intention is that the basic testing strategy (explained in the testing
+/// documentation) is available for every different type of device operation. This
+/// requires us to provide some implementations on two fronts: Support for the
+/// Args, Inputs, Outputs, UniqueInputs, and UniqueOutputs for all SIGNATUREs which
+/// are supported by CK Builder, and support for invoking the different
+/// implementations returned by CK Builder, depending on the Algorithm.
+///
+/// Different SIGNATUREs may require different arguments and different (amounts of)
+/// input/output tensors. Rather than trying to cram all this in the same structure,
+/// or to provide different types, we will use dependent typing to specialize the
+/// implementation for the SIGNATURE at hand. For this reason, the Args, Inputs,
+/// Outputs, UniqueInputs, and UniqueOutputs structures are all parameterized by the
+/// SIGNATURE. The idea is to use C++20 concepts to limit the specialization to the
+/// subset of SIGNATUREs that conceptually make sense for that implementation. For
+/// example, to provide an implementation of the testing framework for forward
+/// convolutions, we can use a concept to check whether the SIGNATURE is a valid
+/// forward convolution signature:
+///
+///     template <auto SIGNATURE>
+///         requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
+///     struct Args<SIGNATURE> { ... }; // Similar for the other types
+///
+/// Invocation of instances is another matter: The Builder may return instances from
+/// either CK or CK-Tile depending on the ALGORITHM configuration. The only place
+/// where this matters is the implementation of `run()`, which needs to provide a
+/// custom implementation for all instances which the Builder may return, including
+/// the reference implementation. The strategy is the same here: Use concepts to
+/// check whether the instance returned by the builder is of a particular type, and
+/// overload the `run()` function for that concept:
+///
+///     template <auto SIGNATURE, typename Conv>
+///         requires
+///             // Check that the SIGNATURE is of the type that we expect
+///             ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE> &&
+///             // Also check that the instance is of a type which we can invoke here
+///             IsCkConvInstance<SIGNATURE, Conv>
+///     void run(Conv& conv, ...);
+///
+/// Note that this is only the suggested strategy; you may also use `if constexpr`
+/// or similar to dispatch the correct implementation of the instance in the
+/// implementation of the `run()` function for a particular group of device
+/// operations.
+///
+/// The remainder of this file describes the types and functions that should be
+/// overloaded for a particular device operation, and in which situation.
+
+namespace ck_tile::builder::test {
+
+/// @brief Run-time arguments corresponding to a signature.
+///
+/// The `Args` structure is the main point of runtime configuration for a device
+/// operation. Depending on the SIGNATURE, it is used to provide the run-time
+/// parameters for a device operation, for instance, for the tensor dimensions,
+/// tensor strides, parameters such as padding, split-K batch size, fused
+/// element-wise operator instances, etc. In short, a complete run-time
+/// configuration of the tensor operation at hand.
+///
+/// This structure does not require additional member functions; any that are
+/// provided should be considered implementation details of the Args structure
+/// for that particular SIGNATURE.
+///
+/// @note A good indicator of the fields necessary here are the values that should
+/// be passed to the CK `MakeArgument()` function or CK-Tile `HostArgs` structure
+/// of the device operation that you are trying to implement. It is the intention
+/// that this structure is an aggregate so that it can be initialized using C++20
+/// designated initializers to keep the tests readable.
+///
+/// @tparam SIGNATURE the signature to specialize the structure for.
+template <auto SIGNATURE>
+struct Args;
+
+/// @brief Non-owning input collection corresponding to a signature.
+///
+/// The `Inputs` structure represents the collection of input tensor data on the
+/// device, associated with a particular SIGNATURE. The exact fields in this structure
+/// may again depend on the exact SIGNATURE. This structure is non-owning: its use
+/// is intended as a way to pass all inputs around as a single value.
+///
+/// This structure does not require additional member functions; any that are
+/// provided should be considered implementation details of the Inputs structure
+/// for that particular SIGNATURE.
+///
+/// @note The implementation can just be a set of void-pointers which conceptually
+/// represent the inputs of the device operation. It is the intention that this
+/// structure is an aggregate so that it can be initialized using C++20
+/// designated initializers to keep the tests readable.
+///
+/// @tparam SIGNATURE the signature to specialize the structure for.
+template <auto SIGNATURE>
+struct Inputs;
+
+/// @brief Non-owning outputs collection corresponding to a signature.
+///
+/// The `Outputs` structure represents the collection of output tensor data on the
+/// device, associated with a particular SIGNATURE. The exact fields in this structure
+/// may again depend on the exact SIGNATURE. This structure is non-owning: its use
+/// is intended as a way to pass all outputs around as a single value.
+///
+/// This structure does not require additional member functions; any that are
+/// provided should be considered implementation details of the Outputs structure
+/// for that particular SIGNATURE.
+///
+/// @note The implementation can just be a set of void-pointers which conceptually
+/// represent the outputs of the device operation. It is the intention that this
+/// structure is an aggregate so that it can be initialized using C++20
+/// designated initializers to keep the tests readable.
+///
+/// @tparam SIGNATURE the signature to specialize the structure for.
+template <auto SIGNATURE>
+struct Outputs;
+
+/// @brief RAII-enabled inputs collection corresponding to a signature.
+///
+/// The `UniqueInputs` structure is used to automatically manage the memory of a
+/// set of inputs. Unlike the corresponding `Inputs` structure, the implementation
+/// is opaque; the only requirements for this structure are that an instance can
+/// be created using `alloc_inputs()` and that an instance of the corresponding
+/// `Inputs` structure can be obtained using `.get()`.
+///
+/// @note The easiest way to implement this type is to use the `DeviceBuffer`
+/// type to allocate individual device buffers for each input tensor.
+///
+/// @tparam SIGNATURE the signature to specialize the structure for.
+///
+/// @see alloc_inputs()
+/// @see ValidUniqueInputs
+/// @see DeviceBuffer
+template <auto SIGNATURE>
+struct UniqueInputs;
+
+/// @brief RAII-enabled outputs collection corresponding to a signature.
+///
+/// The `UniqueOutputs` structure is used to automatically manage the memory of a
+/// set of outputs. Unlike the corresponding `Outputs` structure, the implementation
+/// is opaque; the only requirements for this structure are that an instance can
+/// be created using `alloc_outputs()` and that an instance of the corresponding
+/// `Outputs` structure can be obtained using `.get()`.
+///
+/// @note The easiest way to implement this type is to use the `DeviceBuffer`
+/// type to allocate individual device buffers for each output tensor.
+///
+/// @tparam SIGNATURE the signature to specialize the structure for.
+///
+/// @see alloc_outputs()
+/// @see ValidUniqueOutputs
+/// @see DeviceBuffer
+template <auto SIGNATURE>
+struct UniqueOutputs;
+
+/// @brief Concept to check the validity of `UniqueInputs`.
+///
+/// The `ValidUniqueInputs` concept can be used to check whether the definition
+/// of `UniqueInputs` is valid for a particular SIGNATURE.
+///
+/// @tparam SIGNATURE the signature whose `UniqueInputs` specialization is checked.
+///
+/// @see UniqueInputs
+template <auto SIGNATURE>
+concept ValidUniqueInputs = requires(UniqueInputs<SIGNATURE>& inputs) {
+    /// `.get()` must yield a value convertible to the non-owning `Inputs` collection.
+    { inputs.get() } -> std::convertible_to<Inputs<SIGNATURE>>;
+};
+
+/// @brief Concept to check the validity of `UniqueOutputs`.
+///
+/// The `ValidUniqueOutputs` concept can be used to check whether the definition
+/// of `UniqueOutputs` is valid for a particular SIGNATURE.
+///
+/// @tparam SIGNATURE the signature whose `UniqueOutputs` specialization is checked.
+///
+/// @see UniqueOutputs
+template <auto SIGNATURE>
+concept ValidUniqueOutputs = requires(UniqueOutputs<SIGNATURE>& outputs) {
+    /// `.get()` must yield a value convertible to the non-owning `Outputs` collection.
+    { outputs.get() } -> std::convertible_to<Outputs<SIGNATURE>>;
+};
+
+/// @brief Allocate inputs corresponding to a signature.
+///
+/// The `alloc_inputs()` function creates an instance of `UniqueInputs`. It uses the `args`
+/// structure to compute the amount of memory required and then allocates it on the device,
+/// for example using `alloc_buffer` or `alloc_tensor_buffer`.
+///
+/// @tparam SIGNATURE the signature to specialize the function for.
+/// @param args The run-time arguments used to compute the amount of memory required.
+///
+/// @see Inputs
+/// @see UniqueInputs
+/// @see alloc_buffer()
+/// @see alloc_tensor_buffer()
+template <auto SIGNATURE>
+    requires ValidUniqueInputs<SIGNATURE>
+UniqueInputs<SIGNATURE> alloc_inputs(const Args<SIGNATURE>& args);
+
+/// @brief Allocate outputs corresponding to a signature.
+///
+/// The `alloc_outputs()` function creates an instance of `UniqueOutputs`. It uses the `args`
+/// structure to compute the amount of memory required and then allocates it on the device,
+/// for example using `alloc_buffer` or `alloc_tensor_buffer`.
+///
+/// @tparam SIGNATURE the signature to specialize the function for.
+/// @param args The run-time arguments used to compute the amount of memory required.
+///
+/// @see Outputs
+/// @see UniqueOutputs
+/// @see alloc_buffer()
+/// @see alloc_tensor_buffer()
+template <auto SIGNATURE>
+    requires ValidUniqueOutputs<SIGNATURE>
+UniqueOutputs<SIGNATURE> alloc_outputs(const Args<SIGNATURE>& args);
+
+/// @brief Invoke a device operation created by CK Builder.
+///
+/// This is the main function used to invoke a particular device operation
+/// instance created by the builder. It uses the `args`, `inputs`, and `outputs`
+/// to configure the `operation` and invokes it immediately.
+///
+/// In practice, the `Operation` is usually a CK or CK Tile device operation
+/// type, for example `DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3`.
+/// This function implements the required functionality to invoke any relevant
+/// type created by the builder.
+///
+/// @note Unlike the Args, Inputs, Outputs, and related structures, this function
+/// is specialized for the different implementations that the builder may
+/// return (see file-level documentation).
+///
+/// @pre The tensors in `inputs` should be allocated and initialized with the
+///   appropriate values to perform the operation.
+/// @pre The tensors in `outputs` should be allocated.
+/// @post The tensors in `outputs` are overwritten with the outputs of the device
+///   operation.
+///
+/// @tparam SIGNATURE the signature to specialize this function for.
+/// @tparam Operation the kernel of the operation to invoke. This type should be
+///   one that is created using the Builder API.
+/// @param operation An instance of the operation to invoke.
+/// @param args The run-time arguments of the operation.
+/// @param inputs The input tensor data. Will not be modified by this function.
+/// @param outputs The (non-owning) output tensor data. The tensor contents will
+///   be overwritten by this function.
+template <auto SIGNATURE, typename Operation>
+void run(Operation& operation,
+         const Args<SIGNATURE>& args,
+         const Inputs<SIGNATURE>& inputs,
+         const Outputs<SIGNATURE>& outputs);
+
+} // namespace ck_tile::builder::test
diff --git a/experimental/builder/include/ck_tile/builder/testing/type_traits.hpp b/experimental/builder/include/ck_tile/builder/testing/type_traits.hpp
new file mode 100644
index 0000000000..14b8e75668
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/testing/type_traits.hpp
@@ -0,0 +1,43 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <cstddef>
+#include "ck_tile/builder/types.hpp"
+
+/// This file implements various backend-independent traits for
+/// CK-Builder types.
+
+namespace ck_tile::builder::test {
+
+/// @brief Query the size of a data type in memory.
+///
+/// This function computes the size of a variant of `DataType` in memory. For most
+/// types this is just the size of the equivalent C++ type, but a sub-byte type packs
+/// multiple values into each byte. For now, we only care about types which consist of
+/// an integral number of bytes, though.
+///
+/// @note The details of this function are likely going to change with the
+/// support of sub-byte types.
+///
+/// @param data_type The type to query the in-memory size of.
+/// @returns The number of bytes that an element of this data type requires
+///   in memory. `UNDEFINED_DATA_TYPE` and unrecognized values yield 0.
+constexpr size_t data_type_sizeof(DataType data_type)
+{
+    switch(data_type)
+    {
+    case DataType::UNDEFINED_DATA_TYPE: return 0;
+    case DataType::FP32: return 4;
+    case DataType::FP16: return 2;
+    case DataType::BF16: return 2;
+    case DataType::FP8: return 1;
+    case DataType::INT32: return 4;
+    case DataType::I8: return 1;
+    case DataType::U8: return 1;
+    }
+    return 0; // Only reached for values outside the cases above; avoids flowing off the end.
+}
+
+} // namespace ck_tile::builder::test
diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt
index eef1110d27..ada97afa2b 100644
--- a/experimental/builder/test/CMakeLists.txt
+++ b/experimental/builder/test/CMakeLists.txt
@@ -78,24 +78,27 @@ add_ck_builder_test(test_ckb_conv_builder
     test_fwd_instance_traits.cpp
     test_bwd_data_instance_traits.cpp
     test_instance_traits_util.cpp
-
+    unit_device_buffer.cpp
+    unit_tensor_descriptor.cpp
     unit_conv_elementwise_op.cpp
     unit_conv_tensor_layout.cpp
     unit_conv_tensor_type.cpp
     unit_conv_thread_block.cpp
-    unit_conv_tuning_params.cpp)
-    
-    # Tests the inline diff utility used for comparing strings in tests assertions
-    add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp)
+    unit_conv_tuning_params.cpp
+    unit_conv_fwd_testing.cpp)
+target_link_libraries(test_ckb_conv_builder PRIVATE utility)
+
+# Tests the inline diff utility used for comparing strings in test assertions
+add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp)
+
+# Tests convolution trait selection and configuration
+add_ck_builder_test(test_ckb_conv_traits
+    conv/ck/test_conv_traits.cpp)
+
+# Tests convolution problem description and parameter handling
+add_ck_builder_test(test_ckb_conv_description
+    test_conv_description.cpp)
 
-    # Tests convolution trait selection and configuration
-    add_ck_builder_test(test_ckb_conv_traits
-        conv/ck/test_conv_traits.cpp)
-    
-    # Tests convolution problem description and parameter handling
-    add_ck_builder_test(test_ckb_conv_description
-        test_conv_description.cpp)
-    
 ################################################################################
 # REGRESSION TESTS - Integration Tests (With Kernel Compilation)
 ################################################################################
@@ -134,8 +137,8 @@ add_ck_builder_test(test_ckb_build_fwd_instances
     conv/ck/test_ckb_conv_fwd_3d_fp32.cpp
     conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp
     conv/ck_tile/test_ckb_conv_bwd_weight_2d_fp16_v3.cpp
-    conv/ck_tile/test_ckb_conv_bwd_data_2d_fp16_v3.cpp
-    )
+    conv/ck_tile/test_ckb_conv_bwd_data_2d_fp16_v3.cpp)
+target_link_libraries(test_ckb_build_fwd_instances PRIVATE utility)
 
 
 ################################################################################
diff --git a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp
index bba0128810..b7eacf5643 100644
--- a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp
+++ b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp
@@ -4,46 +4,83 @@
 #include "utils/ckb_conv_test_configs.hpp"
 #include "utils/ckb_conv_test_utils.hpp"
 #include "utils/conv_algorithm_type_utils.hpp"
+#include "ck_tile/builder/testing/conv_fwd_ck.hpp"
+#include "ck_tile/host/device_prop.hpp"
 
-namespace {
+namespace ckb = ck_tile::builder;
+namespace ckt = ck_tile::builder::test;
+namespace cku = ck_tile::builder::test_utils;
 
-using namespace ck_tile::builder::test_utils;
+constexpr auto SIGNATURE =
+    ckt::ConvSignature{.spatial_dim            = 2,
+                       .direction              = ckb::ConvDirection::FORWARD,
+                       .data_type              = ckb::DataType::FP16,
+                       .accumulation_data_type = ckb::DataType::FP32,
+                       .input                  = {.config = {.layout = ckb::TensorLayout::GNHWC}},
+                       .weight                 = {.config = {.layout = ckb::TensorLayout::GKYXC}},
+                       .output                 = {.config = {.layout = ckb::TensorLayout::GNHWK}}};
 
-TEST(FwdConvInstances,
-     Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP16_GNHWC)
+constexpr auto ALGORITHM = cku::ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3{}
+                               .with_thread_block(cku::FwdThreadBlock_256_256x256x32)
+                               .with_gemm_config(cku::FwdGemmParams_Xdl_4x4_per_wave)
+                               .with_transfer(cku::FwdTransfer_4x64x1)
+                               .with_specializations(ckb::ConvFwdSpecialization::DEFAULT,
+                                                     ckb::GemmSpecialization::MNKPadding)
+                               .with_block_gemm(cku::BlockGemmDesc_v3_intrawave);
+
+using Builder  = ckb::ConvBuilder<SIGNATURE, ALGORITHM>;
+using Instance = Builder::Instance;
+
+TEST(Fwd2DFp16_CShufV3_GNHWC, Create)
 {
-    using enum ck_tile::builder::ConvDirection;
-    using enum ck_tile::builder::DataType;
-    using enum ck_tile::builder::TensorLayout;
-
-    constexpr ConvSignature FwdConvSignature{.spatial_dim            = 2,
-                                             .direction              = FORWARD,
-                                             .data_type              = FP16,
-                                             .accumulation_data_type = FP32,
-                                             .input  = {.config = {.layout = GNHWC}},
-                                             .weight = {.config = {.layout = GKYXC}},
-                                             .output = {.config = {.layout = GNHWK}}};
-
-    constexpr auto FwdConvAlgorithm =
-        ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3{}
-            .with_thread_block(FwdThreadBlock_256_256x256x32)
-            .with_gemm_config(FwdGemmParams_Xdl_4x4_per_wave)
-            .with_transfer(FwdTransfer_4x64x1)
-            .with_specializations(ConvFwdSpecialization::FILTER_1X1_PAD0,
-                                  GemmSpecialization::MNKPadding)
-            .with_block_gemm(BlockGemmDesc_v3_intrawave);
-
-    using Builder = ConvBuilder<FwdConvSignature, FwdConvAlgorithm>;
-
-    const auto expected_transfer_parameters = to_string(FwdConvAlgorithm);
-    run_test<Builder>({"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3",
-                       expected_transfer_parameters,
-                       "Filter1x1Pad0",
-                       "Intrawave",
-                       "v3",
-                       "GNHWC,GKYXC,EmptyTuple,GNHWK",
-                       "PassThrough,PassThrough,PassThrough",
-                       "MNKPadding"});
+    const auto expected_transfer_parameters = to_string(ALGORITHM);
+    cku::run_test<Builder>({"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3",
+                            expected_transfer_parameters,
+                            "Default",
+                            "Intrawave",
+                            "v3",
+                            "GNHWC,GKYXC,EmptyTuple,GNHWK",
+                            "PassThrough,PassThrough,PassThrough",
+                            "MNKPadding"});
 }
 
-} // namespace
+TEST(Fwd2DFp16_CShufV3_GNHWC, EndToEnd)
+{
+    if(!ck_tile::get_device_name().starts_with("gfx9"))
+    {
+        GTEST_SKIP() << "unsupported architecture";
+    }
+
+    ckt::Args<SIGNATURE> args = {
+        .lengths =
+            {
+                .batch_size      = 16,
+                .groups          = 1,
+                .input_channels  = 32,
+                .output_channels = 48,
+                .image =
+                    {
+                        .width  = 56,
+                        .height = 64,
+                    },
+                .filter =
+                    {
+                        .width  = 3,
+                        .height = 5,
+                    },
+            },
+        .filter_strides     = {.width = 1, .height = 1},
+        .filter_dilation    = {.width = 1, .height = 1},
+        .input_left_pad     = {.width = 0, .height = 0},
+        .input_right_pad    = {.width = 0, .height = 0},
+        .a_elementwise_op   = {},
+        .b_elementwise_op   = {},
+        .cde_elementwise_op = {},
+    };
+
+    auto inputs  = alloc_inputs(args);
+    auto outputs = alloc_outputs(args);
+
+    auto conv = Instance{};
+    ckt::run(conv, args, inputs.get(), outputs.get());
+}
diff --git a/experimental/builder/test/test_testing_utils.cpp b/experimental/builder/test/test_testing_utils.cpp
index dd65f3f327..43bbbd69eb 100644
--- a/experimental/builder/test/test_testing_utils.cpp
+++ b/experimental/builder/test/test_testing_utils.cpp
@@ -5,6 +5,8 @@
 
 #include "testing_utils.hpp"
 
+using ck_tile::test::HipError;
+using ck_tile::test::HipSuccess;
 using ck_tile::test::InstanceMatcher;
 using ck_tile::test::InstanceSet;
 using ck_tile::test::StringEqWithDiff;
@@ -96,3 +98,12 @@ TEST(InstanceMatcher, ExplainMatchResult)
                                  "Unexpected: 1\n"
                                  "- python\n"));
 }
+
+TEST(HipStatusMatcher, Basic)
+{
+    EXPECT_THAT(hipSuccess, HipSuccess());
+    EXPECT_THAT(hipErrorInvalidValue, HipError(hipErrorInvalidValue));
+    EXPECT_THAT(hipErrorInvalidValue, Not(HipSuccess()));
+    EXPECT_THAT(hipSuccess, Not(HipError(hipErrorInvalidValue)));
+    EXPECT_THAT(hipErrorOutOfMemory, Not(HipError(hipErrorInvalidValue)));
+}
diff --git a/experimental/builder/test/testing_utils.cpp b/experimental/builder/test/testing_utils.cpp
index 98b404a78f..b60c35333e 100644
--- a/experimental/builder/test/testing_utils.cpp
+++ b/experimental/builder/test/testing_utils.cpp
@@ -11,6 +11,11 @@
 #include <vector>
 #include <algorithm>
 
+std::ostream& operator<<(std::ostream& os, hipError_t status)
+{
+    return os << hipGetErrorString(status);
+}
+
 namespace ck_tile::test {
 
 // Wagner-Fischer Algorithm for Computing Edit Distance and Inline Diff
@@ -297,4 +302,41 @@ void InstanceMatcher::DescribeNegationTo(std::ostream* os) const
     *os << "is not equal to " << expected_;
 }
 
+// Matches when the observed HIP status is exactly the expected one.
+bool HipStatusMatcher::MatchAndExplain(hipError_t actual,
+                                       ::testing::MatchResultListener* listener) const
+{
+    // Nothing is written to the listener: the comparison is a plain equality
+    // check, so the parameter is only referenced to mark it as deliberately
+    // unused.
+    (void)listener;
+
+    const bool same_status = (actual == expected_);
+    return same_status;
+}
+
+void HipStatusMatcher::DescribeTo(std::ostream* os) const { *os << hipGetErrorString(expected_); }
+
+void HipStatusMatcher::DescribeNegationTo(std::ostream* os) const
+{
+    if(expected_ == hipSuccess)
+    {
+        *os << "any error";
+    }
+    else
+    {
+        *os << "isn't equal to " << hipGetErrorString(expected_);
+    }
+}
+
+::testing::Matcher<hipError_t> HipSuccess()
+{
+    return ::testing::MakeMatcher(new HipStatusMatcher(hipSuccess));
+}
+
+::testing::Matcher<hipError_t> HipError(hipError_t error)
+{
+    return ::testing::MakeMatcher(new HipStatusMatcher(error));
+}
+
 } // namespace ck_tile::test
diff --git a/experimental/builder/test/testing_utils.hpp b/experimental/builder/test/testing_utils.hpp
index 23eec7aff2..7a03851ac4 100644
--- a/experimental/builder/test/testing_utils.hpp
+++ b/experimental/builder/test/testing_utils.hpp
@@ -11,6 +11,16 @@
 #include <vector>
 #include <array>
 
+/// @brief ostream-overload for hipError
+///
+/// Google Test likes to print errors to ostream, and this provides integration
+/// with that. Since we only expect to use this with CK-Builder's own tests,
+/// providing this implementation seems not problematic, but if it starts to
+/// clash with another implementation then we will need to provide this
+/// implementation another way. Unfortunately Google Test does not have a
+/// dedicated function to override to provide printing support.
+std::ostream& operator<<(std::ostream& os, hipError_t status);
+
 namespace ck_tile::test {
 
 static bool isTerminalOutput() { return isatty(fileno(stdout)) || isatty(fileno(stderr)); }
@@ -109,4 +119,35 @@ struct InstanceMatcher : public ::testing::MatcherInterface<InstanceSet>
 
 ::testing::Matcher<InstanceSet> InstancesMatch(const InstanceSet& expected);
 
+/// @brief Google Test hipError_t matcher.
+///
+/// This is a custom Google Test matcher implementation which can be used to
+/// compare HIP status codes. Use `HipSuccess()` or `HipError()` to obtain
+/// an instance.
+///
+/// @see HipSuccess
+/// @see HipError
+/// @see ::testing::MatcherInterface
+struct HipStatusMatcher : public ::testing::MatcherInterface<hipError_t>
+{
+    explicit HipStatusMatcher(hipError_t expected) : expected_(expected) {}
+
+    bool MatchAndExplain(hipError_t actual,
+                         ::testing::MatchResultListener* listener) const override;
+    void DescribeTo(std::ostream* os) const override;
+    void DescribeNegationTo(std::ostream* os) const override;
+
+    hipError_t expected_; ///< Status code that the matched value is compared against.
+};
+
+/// @brief Construct a Google Test matcher that checks that a HIP operation
+/// was successful.
+::testing::Matcher<hipError_t> HipSuccess();
+
+/// @brief Construct a Google Test matcher that checks that a HIP operation
+/// returned a particular error code.
+///
+/// @param error The error to expect.
+::testing::Matcher<hipError_t> HipError(hipError_t error);
+
 } // namespace ck_tile::test
diff --git a/experimental/builder/test/unit_conv_fwd_testing.cpp b/experimental/builder/test/unit_conv_fwd_testing.cpp
new file mode 100644
index 0000000000..3243935ca5
--- /dev/null
+++ b/experimental/builder/test/unit_conv_fwd_testing.cpp
@@ -0,0 +1,83 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "impl/conv_signature_types.hpp"
+#include "testing_utils.hpp"
+#include "ck_tile/builder/testing/conv_fwd.hpp"
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include <vector>
+
+namespace ckb = ck_tile::builder;
+namespace ckt = ck_tile::builder::test;
+
+using ::testing::ElementsAreArray;
+using ::testing::NotNull;
+
+constexpr auto SIGNATURE =
+    ckt::ConvSignature{.spatial_dim            = 2,
+                       .direction              = ckb::ConvDirection::FORWARD,
+                       .data_type              = ckb::DataType::BF16,
+                       .accumulation_data_type = ckb::DataType::FP32,
+                       .input                  = {.config = {.layout = ckb::TensorLayout::NHWGC}},
+                       .weight                 = {.config = {.layout = ckb::TensorLayout::GKYXC}},
+                       .output                 = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
+
+constexpr ckt::Args<SIGNATURE> ARGS = {
+    .lengths =
+        {
+            .batch_size      = 17,
+            .groups          = 5,
+            .input_channels  = 13,
+            .output_channels = 44,
+            .image =
+                {
+                    .width  = 99,
+                    .height = 125,
+                },
+            .filter =
+                {
+                    .width  = 9,
+                    .height = 4,
+                },
+        },
+    .filter_strides     = {.width = 1, .height = 1},
+    .filter_dilation    = {.width = 1, .height = 1},
+    .input_left_pad     = {.width = 0, .height = 0},
+    .input_right_pad    = {.width = 0, .height = 0},
+    .a_elementwise_op   = {},
+    .b_elementwise_op   = {},
+    .cde_elementwise_op = {},
+};
+
+using Inputs        = ckt::Inputs<SIGNATURE>;
+using Outputs       = ckt::Outputs<SIGNATURE>;
+using UniqueInputs  = ckt::UniqueInputs<SIGNATURE>;
+using UniqueOutputs = ckt::UniqueOutputs<SIGNATURE>;
+
+static_assert(ckt::ValidUniqueInputs<SIGNATURE>);
+static_assert(ckt::ValidUniqueOutputs<SIGNATURE>);
+
+TEST(ConvFwdTesting, MakeDescriptors)
+{
+    const auto get_lengths = [](const auto& descriptor) {
+        const auto lengths = descriptor.get_lengths();
+        // Google Test cannot print std::span, so turn it into a vector for
+        // legibility.
+        return std::vector(lengths.begin(), lengths.end());
+    };
+
+    EXPECT_THAT(get_lengths(ARGS.make_input_descriptor()), ElementsAreArray({5, 17, 13, 125, 99}));
+    EXPECT_THAT(get_lengths(ARGS.make_weight_descriptor()), ElementsAreArray({5, 44, 13, 4, 9}));
+    EXPECT_THAT(get_lengths(ARGS.make_output_descriptor()), ElementsAreArray({5, 17, 44, 122, 91}));
+}
+
+TEST(ConvFwdTesting, Alloc)
+{
+    auto inputs  = alloc_inputs(ARGS);
+    auto outputs = alloc_outputs(ARGS);
+
+    EXPECT_THAT(inputs.get().input, NotNull());
+    EXPECT_THAT(inputs.get().weight, NotNull());
+    EXPECT_THAT(outputs.get().output, NotNull());
+}
diff --git a/experimental/builder/test/unit_device_buffer.cpp b/experimental/builder/test/unit_device_buffer.cpp
new file mode 100644
index 0000000000..75408acc16
--- /dev/null
+++ b/experimental/builder/test/unit_device_buffer.cpp
@@ -0,0 +1,81 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck_tile/builder/testing/tensor_buffer.hpp"
+#include "testing_utils.hpp"
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include <vector>
+
+namespace ckb = ck_tile::builder;
+namespace ckt = ck_tile::builder::test;
+
+using ck_tile::test::HipError;
+using ck_tile::test::HipSuccess;
+using ::testing::Eq;
+using ::testing::IsNull;
+using ::testing::NotNull;
+using ::testing::Throws;
+
+TEST(DeviceBuffer, DefaultToNull)
+{
+    ckt::DeviceBuffer buffer;
+    EXPECT_THAT(buffer.get(), IsNull());
+}
+
+TEST(DeviceBuffer, AllocBuffer)
+{
+    const auto size = 12345;
+    auto buffer     = ckt::alloc_buffer(size);
+
+    // Pointer should be non-null
+    EXPECT_THAT(buffer.get(), NotNull());
+
+    // Actually, the pointer should be a device pointer. Stop the test if the query
+    // itself fails, because `attr` would not hold meaningful values in that case.
+    hipPointerAttribute_t attr{};
+    ASSERT_THAT(hipPointerGetAttributes(&attr, buffer.get()), HipSuccess());
+    EXPECT_THAT(attr.devicePointer, NotNull());
+    EXPECT_THAT(attr.type, Eq(hipMemoryTypeDevice));
+
+    // Memory should be writable without error
+    EXPECT_THAT(hipMemset(buffer.get(), 0xFF, size), HipSuccess());
+}
+
+TEST(DeviceBuffer, AutoFree)
+{
+    const auto size = 12345;
+    std::byte* ptr  = nullptr;
+
+    {
+        auto buffer = ckt::alloc_buffer(size);
+        ptr         = buffer.get();
+    }
+
+    // Trying to use a pointer after freeing should return an error in HIP.
+    EXPECT_THAT(hipMemset(ptr, 0xFF, size), HipError(hipErrorInvalidValue));
+}
+
+TEST(DeviceBuffer, ThrowsOnOom)
+{
+    const auto size = size_t{1} << 60; // 1 exabyte
+
+    auto check = [] { auto buffer = ckt::alloc_buffer(size); };
+    EXPECT_THAT(check, Throws<ckt::OutOfDeviceMemoryError>());
+}
+
+TEST(DeviceBuffer, AllocTensorBuffer)
+{
+    std::vector<size_t> lengths = {128, 128, 128};
+    std::vector<size_t> strides = {128 * 128, 128, 1};
+    ckt::TensorDescriptor<ckb::DataType::FP32> descriptor(lengths, strides);
+
+    auto buffer = ckt::alloc_tensor_buffer(descriptor);
+
+    // Pointer should be non-null
+    EXPECT_THAT(buffer.get(), NotNull());
+
+    // Memory should be writable without error
+    EXPECT_THAT(hipMemset(buffer.get(), 0xFF, descriptor.get_element_space_size_in_bytes()),
+                HipSuccess());
+}
diff --git a/experimental/builder/test/unit_tensor_descriptor.cpp b/experimental/builder/test/unit_tensor_descriptor.cpp
new file mode 100644
index 0000000000..07abfe44bd
--- /dev/null
+++ b/experimental/builder/test/unit_tensor_descriptor.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck_tile/builder/testing/tensor_buffer.hpp"
+#include "testing_utils.hpp"
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include <vector>
+
+namespace ckb = ck_tile::builder;
+namespace ckt = ck_tile::builder::test;
+
+using ::testing::ElementsAreArray;
+using ::testing::Ge;
+
+TEST(TensorDescriptor, Basic)
+{
+    constexpr auto dt           = ckb::DataType::FP16;
+    std::vector<size_t> lengths = {123, 456, 789};
+    std::vector<size_t> strides = {456 * 789, 789, 1};
+
+    ckt::TensorDescriptor<dt> descriptor(lengths, strides);
+
+    EXPECT_THAT(descriptor.get_lengths(), ElementsAreArray(lengths));
+    EXPECT_THAT(descriptor.get_strides(), ElementsAreArray(strides));
+}
+
+TEST(TensorDescriptor, ComputeSize)
+{
+    constexpr auto dt           = ckb::DataType::FP32;
+    std::vector<size_t> lengths = {305, 130, 924};
+    std::vector<size_t> strides = {1000 * 1000, 1, 1000};
+
+    ckt::TensorDescriptor<dt> descriptor(lengths, strides);
+
+    // Compute the location of the last item in memory, then add one
+    // to get the minimum size.
+    size_t expected_size = 1;
+    for(size_t i = 0; i < lengths.size(); ++i)
+    {
+        expected_size += (lengths[i] - 1) * strides[i];
+    }
+
+    EXPECT_THAT(descriptor.get_element_space_size(), Ge(expected_size));
+    EXPECT_THAT(descriptor.get_element_space_size_in_bytes(),
+                Ge(expected_size * ckt::data_type_sizeof(dt)));
+}
diff --git a/script/check_copyright_year.sh b/script/check_copyright_year.sh
index 48c050c76b..03bf5db841 100755
--- a/script/check_copyright_year.sh
+++ b/script/check_copyright_year.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 

From 21f06aa47ded64b9a07d81bf4b743c21462178db Mon Sep 17 00:00:00 2001
From: Enrico Degregori <73224202+EnricoDeg@users.noreply.github.com>
Date: Sun, 14 Dec 2025 19:25:47 +0100
Subject: [PATCH 06/10] CK Tile: Enable padding blockscale example (#3417)

* Fix host code padding

* restructure the ref code

* clean up

* Fix compilation error

---------

Co-authored-by: ThomasNing <thomas.ning@amd.com>
---
 .../38_block_scale_gemm/gemm_utils.hpp        |  2 +-
 .../run_gemm_quant_example.inc                | 19 ++--
 .../ck_tile/host/reference/reference_gemm.hpp | 99 ++++++++++---------
 3 files changed, 58 insertions(+), 62 deletions(-)

diff --git a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
index aabbfff3bd..7a4760e1da 100644
--- a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
@@ -91,7 +91,7 @@ struct GemmConfigBase
 {
     static constexpr bool kPadM = false;
     static constexpr bool kPadN = false;
-    static constexpr bool kPadK = false;
+    static constexpr bool kPadK = true;
 
     static constexpr bool PermuteA = false;
     static constexpr bool PermuteB = false;
diff --git a/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc b/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
index fa5e1f12e3..a0e875448d 100644
--- a/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
+++ b/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
@@ -391,25 +391,18 @@ int run_gemm_example_with_layouts(const ck_tile::ArgParser& arg_parser,
     ck_tile::index_t N = arg_parser.get_int("n");
     ck_tile::index_t K = arg_parser.get_int("k");
 
-    if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped ||
-                 QuantMode == ck_tile::QuantType::BQuantGrouped)
-    {
-        if(K % QuantGroupSize::kK != 0)
-        {
-            throw std::runtime_error(
-                "K must be aligned with QuantGroupSize for AQuantGrouped/BQuantGrouped mode");
-        }
-    }
     ck_tile::index_t AQK, BQK, BQN = 0;
     if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
     {
-        AQK = K / QuantGroupSize::kK; // Group quantization: AQK = K / GroupSize
-        BQK = 0;                      // No B quantization
+        AQK = ck_tile::integer_divide_ceil(
+            K, QuantGroupSize::kK); // Group quantization: AQK = ceil(K / GroupSize)
+        BQK = 0;                    // No B quantization
     }
     else if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
     {
-        AQK = 0;                      // No A quantization
-        BQK = K / QuantGroupSize::kK; // Group quantization: BQK = K / GroupSize
+        AQK = 0; // No A quantization
+        BQK = ck_tile::integer_divide_ceil(
+            K, QuantGroupSize::kK); // Group quantization: BQK = ceil(K / GroupSize)
         BQN = ck_tile::integer_divide_ceil(N, QuantGroupSize::kN);
     }
     else if constexpr(QuantMode == ck_tile::QuantType::RowColQuant ||
diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp
index 0aa296b8d9..8b0e3028ae 100644
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
@@ -34,77 +34,82 @@ CK_TILE_HOST void reference_gemm_quant(const HostTensor<ADataType>& a_m_k,
     const std::size_t K = a_m_k.get_length(1);
 
     auto f_mn = [&](auto m, auto n) {
-        AccDataType v_acc = 0, v_block_acc = 0;
+        AccDataType v_acc = 0;
 
-        static_assert(std::is_same_v<ADataType, pk_int4_t> || std::is_same_v<ADataType, fp8_t> ||
-                      std::is_same_v<ADataType, bf8_t>);
-        static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t> ||
-                      std::is_same_v<BDataType, pk_int4_t>);
-        static_assert(std::is_same_v<AccDataType, float>);
-        static_assert(std::is_same_v<CDataType, float> ||
-                      std::is_same_v<CDataType, ck_tile::half_t>);
-        for(std::size_t k = 0; k < K; ++k)
-        {
-            AccDataType v_a;
-            AccDataType v_b;
+        constexpr std::size_t kGroupK = QuantGroupSize::kK;
+
+        // ---- A loader: dequant A(m,k) into AccDataType ----
+        auto load_a = [&](std::size_t k) -> AccDataType {
             if constexpr(std::is_same_v<ADataType, pk_int4_t>)
             {
                 const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
                 const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
-                if(k % 2 == 1)
-                    v_a = fp32_val.hi;
-                else
-                    v_a = fp32_val.lo;
+                return (k & 1) ? fp32_val.hi : fp32_val.lo;
             }
             else
             {
-                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
+                return ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
             }
+        };
+
+        // ---- B loader: dequant B(k,n) into AccDataType ----
+        auto load_b = [&](std::size_t k) -> AccDataType {
             if constexpr(std::is_same_v<BDataType, pk_int4_t>)
             {
                 const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
                 const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
-                if(k % 2 == 1)
-                    v_b = fp32_val.hi;
-                else
-                    v_b = fp32_val.lo;
+                return (k & 1) ? fp32_val.hi : fp32_val.lo;
             }
             else if constexpr(std::is_same_v<BDataType, fp8_t>)
             {
-                v_b = fp8_to_float_raw(b_element_op(b_k_n(k, n)));
+                return fp8_to_float_raw(b_element_op(b_k_n(k, n)));
             }
             else
             {
-                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
+                return ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
             }
-            v_block_acc += v_a * v_b;
+        };
 
-            // Apply group dequant scale
-            if((k + 1) % QuantGroupSize::kK == 0)
+        // ---- scale loader for a given K-group index ----
+        auto load_scale = [&](ck_tile::index_t k_group) -> float {
+            const ck_tile::index_t outer_dim = aquant ? (m / QuantGroupSize::kM) : k_group;
+            const ck_tile::index_t inner_dim = aquant ? k_group : (n / QuantGroupSize::kN);
+
+            if constexpr(std::is_same_v<QDataType, float>)
             {
-                float scale       = 0.f;
-                index_t outer_dim = (aquant) ? (m / QuantGroupSize::kM) : (k / QuantGroupSize::kK);
-                index_t inner_dim = (aquant) ? (k / QuantGroupSize::kK) : (n / QuantGroupSize::kN);
-                if constexpr(std::is_same_v<QDataType, float>)
-                {
-                    scale = q(outer_dim, inner_dim);
-                }
-                else if constexpr(std::is_same_v<QDataType, ck_tile::fp8_t>)
-                {
-                    scale = fp8_to_float_raw(q(outer_dim, inner_dim));
-                }
-                else if constexpr(std::is_same_v<QDataType, ck_tile::bf8_t>)
-                {
-                    scale = bf8_to_float_raw(q(outer_dim, inner_dim));
-                }
-                else
-                {
-                    static_assert(false, "Unexpected Q datatype.");
-                }
-                v_block_acc *= scale;
-                v_acc += v_block_acc;
-                v_block_acc = 0;
+                return q(outer_dim, inner_dim);
             }
+            else if constexpr(std::is_same_v<QDataType, ck_tile::fp8_t>)
+            {
+                return fp8_to_float_raw(q(outer_dim, inner_dim));
+            }
+            else
+            {
+                // Only bf8_t is left; reject any other QDataType at compile time.
+                static_assert(std::is_same_v<QDataType, ck_tile::bf8_t>, "Unexpected Q datatype.");
+                return bf8_to_float_raw(q(outer_dim, inner_dim));
+            }
+        };
+
+        // ---- Loop over K by groups (full and tail) ----
+        for(std::size_t k_begin = 0; k_begin < K; k_begin += kGroupK)
+        {
+            const std::size_t k_end = std::min<std::size_t>(k_begin + kGroupK, K);
+
+            AccDataType v_block_acc = 0;
+
+            // unscaled accumulation within this K-group
+            for(std::size_t k = k_begin; k < k_end; ++k)
+            {
+                const AccDataType v_a = load_a(k);
+                const AccDataType v_b = load_b(k);
+                v_block_acc += v_a * v_b;
+            }
+
+            const ck_tile::index_t k_group = static_cast<ck_tile::index_t>(k_begin / kGroupK);
+            const float scale              = load_scale(k_group);
+
+            v_acc += v_block_acc * scale;
         }
 
         c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));

From 9ac51aa0f44bae776609036f291c3cd2666e84ee Mon Sep 17 00:00:00 2001
From: John Shumway <jshumway@amd.com>
Date: Sun, 14 Dec 2025 12:49:12 -0800
Subject: [PATCH 07/10] Add describe() method to device ops for runtime
 introspection (#3375)

Introduces a polymorphic describe() method to BaseOperator that enables runtime introspection of kernel configurations through a unified interface.

Key changes:

* Add virtual describe() method to BaseOperator returning Description objects
* Implement describe() in 6 device operation classes (conv fwd/bwd variants)
* Create conv_describe.hpp with factory function for ConvDescription
* Extract type definitions to conv_types.hpp to resolve circular dependencies
* Add InstanceStringDescription for kernels without full ConvDescription support

Other Improvements:

* Update tests to use describe() instead of GetInstanceString()
* Remove circular dependency include from conv_traits.hpp
* Add ODD_C to ConvFwdSpecialization enum and fix OddC mapping
* Replace silent fallback in conv_layout() with compile-time error

This provides a foundation for runtime kernel introspection and better tooling support for analyzing and debugging kernel configurations.
---
 .../factory/helpers/ck/conv_tuning_params.hpp |   1 +
 .../ck_tile/builder/reflect/conv_describe.hpp |  49 +++
 .../builder/reflect/conv_description.hpp      |  38 +--
 .../ck_tile/builder/reflect/conv_traits.hpp   | 312 ++++++++++--------
 .../ck_tile/builder/reflect/conv_types.hpp    | 109 ++++++
 .../ck_tile/builder/reflect/description.hpp   |  31 ++
 .../builder/include/ck_tile/builder/types.hpp |  82 ++++-
 .../builder/test/conv/ck/test_conv_traits.cpp |   1 +
 .../builder/test/test_conv_description.cpp    |   5 +-
 ...nstance_string_bwd_weight_grp_conv_xdl.cpp |   8 +-
 .../test_instance_string_fwd_grp_conv.cpp     |  17 +-
 .../test_instance_string_fwd_grp_conv_dl.cpp  |   8 +-
 ...tance_string_fwd_grp_conv_large_tensor.cpp |  13 +-
 .../test_instance_string_fwd_grp_conv_v3.cpp  |   9 +-
 ...test_instance_string_fwd_grp_conv_wmma.cpp |   8 +-
 .../gpu/device/device_base.hpp                |  11 +
 ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp |   6 +
 ..._conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp |   6 +
 ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp |  16 +
 ..._conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp |   7 +
 ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp |   6 +
 ...d_multiple_d_xdl_large_tensor_cshuffle.hpp |  17 +
 22 files changed, 549 insertions(+), 211 deletions(-)
 create mode 100644 experimental/builder/include/ck_tile/builder/reflect/conv_describe.hpp
 create mode 100644 experimental/builder/include/ck_tile/builder/reflect/conv_types.hpp

diff --git a/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tuning_params.hpp b/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tuning_params.hpp
index 3ec0a94960..db741f2112 100644
--- a/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tuning_params.hpp
+++ b/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tuning_params.hpp
@@ -153,6 +153,7 @@ consteval ck::tensor_operation::device::ConvolutionForwardSpecialization SetFwdC
     case ConvFwdSpecialization::FILTER_1X1_PAD0: return ck_conv_spec::Filter1x1Pad0;
     case ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0: return ck_conv_spec::Filter1x1Stride1Pad0;
     case ConvFwdSpecialization::FILTER_3x3: return ck_conv_spec::Filter3x3;
+    case ConvFwdSpecialization::ODD_C: return ck_conv_spec::OddC;
     default: throw "Unknown ConvFwdSpecialization";
     }
 }
diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_describe.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_describe.hpp
new file mode 100644
index 0000000000..fdbfa7c4e1
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/reflect/conv_describe.hpp
@@ -0,0 +1,49 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/// @file
+/// @brief Implementation of the describe() function template for convolution kernels
+
+#pragma once
+
+#include "ck_tile/builder/reflect/conv_description.hpp"
+#include "ck_tile/builder/reflect/conv_traits.hpp"
+
+namespace ck_tile::reflect {
+
+/// @brief Factory function to create ConvDescription from a convolution instance type
+/// @tparam Instance The convolution instance type (must have ConvTraits)
+/// @return A ConvDescription object populated with the instance's configuration details
+template <conv::HasConvTraits Instance>
+conv::ConvDescription describe()
+{
+    using Traits = conv::ConvTraits<Instance>;
+
+    return conv::ConvDescription(
+        conv::ConvSignatureInfo{
+            .spatial_dim       = Traits::spatial_dim,
+            .direction         = Traits::direction,
+            .input_layout      = Traits::layout[0],
+            .weight_layout     = Traits::layout[1],
+            .output_layout     = Traits::layout[2],
+            .data_type         = Traits::data_type,
+            .input_element_op  = Traits::input_element_op,
+            .weight_element_op = Traits::weight_element_op,
+            .output_element_op = Traits::output_element_op,
+        },
+        conv::GemmAlgorithmInfo{
+            .thread_block_size   = Traits::thread_block_size,
+            .tile_dims           = Traits::tile_dims,
+            .warp_gemm           = Traits::warp_gemm,
+            .a_tile_transfer     = Traits::a_tile_transfer,
+            .b_tile_transfer     = Traits::b_tile_transfer,
+            .c_tile_transfer     = Traits::c_tile_transfer,
+            .pipeline_version    = Traits::pipeline_version,
+            .pipeline_scheduler  = Traits::pipeline_scheduler,
+            .conv_specialization = Traits::conv_specialization,
+            .padding             = Traits::gemm_padding,
+        },
+        []() { return reflect::instance_string<Instance>(); });
+}
+
+} // namespace ck_tile::reflect
diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp
index 59ff83c238..46c9bb488e 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp
@@ -25,7 +25,7 @@
 #include <functional>
 
 #include <ck_tile/builder/conv_signature_concepts.hpp>
-#include <ck_tile/builder/reflect/conv_traits.hpp>
+#include <ck_tile/builder/reflect/conv_types.hpp>
 #include <ck_tile/builder/reflect/description.hpp>
 #include <ck_tile/builder/reflect/instance_traits.hpp>
 #include <ck_tile/builder/reflect/tree_formatter.hpp>
@@ -249,41 +249,7 @@ class ConvDescription : public Description
     GemmAlgorithmInfo algorithm_;
     std::function<std::string()> instance_string_getter_;
 };
+
 } // namespace conv
 
-/// @brief Factory function to create ConvDescription from a convolution instance type
-/// @tparam Instance The convolution instance type (must have ConvTraits specialization)
-/// @return A ConvDescription object populated with the instance's configuration details
-template <conv::HasConvTraits Instance>
-conv::ConvDescription describe()
-{
-    using Traits = conv::ConvTraits<Instance>;
-
-    return conv::ConvDescription(
-        conv::ConvSignatureInfo{
-            .spatial_dim       = Traits::spatial_dim,
-            .direction         = Traits::direction,
-            .input_layout      = Traits::layout[0],
-            .weight_layout     = Traits::layout[1],
-            .output_layout     = Traits::layout[2],
-            .data_type         = Traits::data_type,
-            .input_element_op  = Traits::input_element_op,
-            .weight_element_op = Traits::weight_element_op,
-            .output_element_op = Traits::output_element_op,
-        },
-        conv::GemmAlgorithmInfo{
-            .thread_block_size   = Traits::thread_block_size,
-            .tile_dims           = Traits::tile_dims,
-            .warp_gemm           = Traits::warp_gemm,
-            .a_tile_transfer     = Traits::a_tile_transfer,
-            .b_tile_transfer     = Traits::b_tile_transfer,
-            .c_tile_transfer     = Traits::c_tile_transfer,
-            .pipeline_version    = Traits::pipeline_version,
-            .pipeline_scheduler  = Traits::pipeline_scheduler,
-            .conv_specialization = Traits::conv_specialization,
-            .padding             = Traits::gemm_padding,
-        },
-        []() { return reflect::instance_string<Instance>(); });
-}
-
 } // namespace ck_tile::reflect
diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp
index e5a5638887..ab1d1d76ed 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp
@@ -10,8 +10,8 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/utility/pipeline_enum.hpp"
 #include "ck/utility/scheduler_enum.hpp"
-#include "ck_tile/builder/conv_builder.hpp"
 #include "ck_tile/builder/conv_signature_concepts.hpp"
+#include "ck_tile/builder/reflect/conv_types.hpp"
 #include "ck_tile/builder/reflect/instance_traits.hpp"
 #include "ck_tile/builder/reflect/instance_traits_util.hpp"
 #include "ck_tile/builder/types.hpp"
@@ -161,103 +161,19 @@ constexpr auto convert_pipeline_scheduler()
     }
 }
 
-/// @brief Helper structures for organizing trait data with domain-specific naming
-
-/// @brief Data tile dimensions processed by a workgroup.
-/// @details This struct defines the M, N, and K dimensions of the data tile
-/// that a single workgroup (thread block) is responsible for processing in the
-/// underlying GEMM computation.
-struct DataTileInfo
-{
-    int m; ///< M dimension of the tile processed by the workgroup (MPerBlock).
-    int n; ///< N dimension of the tile processed by the workgroup (NPerBlock).
-    int k; ///< K dimension of the tile processed by the workgroup (KPerBlock).
-};
-
-/// @brief Dimensions for an input data tile transfer.
-/// @details Defines the shape of the input tile (A or B matrix) as it is
-/// transferred from global memory to LDS. The tile is conceptually divided
-/// into k0 and k1 dimensions.
-struct InputTileTransferDimensions
-{
-    int k0;     ///< The outer dimension of K, where K = k0 * k1.
-    int m_or_n; ///< The M dimension for the A matrix transfer, or the N dimension for the B matrix.
-    int k1; ///< The inner dimension of K, often corresponding to the vector load size from global
-            ///< memory.
-};
-
-/// @brief Parameters governing the transfer of an input tile.
-/// @details This struct holds configuration details for how an input tile is
-/// loaded from global memory into LDS, including thread clustering, memory
-/// access patterns, and vectorization settings.
-struct InputTileTransferParams
-{
-    int k1; ///< The inner K dimension size, often matching the vectorization width.
-    std::array<int, 3>
-        thread_cluster_dims; ///< Spatial thread distribution over the input data tile; defines how
-                             ///< many threads are arranged on each axis.
-    std::array<int, 3> thread_cluster_order; ///< The order of thread spatial distribution over the
-                                             ///< input tensor dimensions.
-    std::array<int, 3> src_access_order; ///< The order of accessing input tensor axes (e.g., which
-                                         ///< dimension to read first).
-    int src_vector_dim; ///< The index of the axis on which vectorized memory access is performed
-                        ///< (the contiguous dimension).
-    int src_scalar_per_vector;    ///< The size of the vector access instruction; the number of
-                                  ///< elements accessed per thread per instruction.
-    int dst_scalar_per_vector_k1; ///< The size of the vectorized store into LDS memory along the K1
-                                  ///< dimension.
-    bool lds_padding; ///< Flag indicating if padding is used for the LDS tensor to prevent bank
-                      ///< conflicts.
-};
-
-/// @brief Complete information for an input tile transfer.
-/// @details Combines the dimensional information and transfer parameters for
-/// a full description of an input tile's journey from global memory to LDS.
-struct InputTileTransferInfo
-{
-    InputTileTransferDimensions tile_dimensions; ///< The shape and layout of the tile.
-    InputTileTransferParams transfer_params; ///< The parameters for the memory transfer operation.
-};
-
-/// @brief Parameters for the warp-level GEMM computation.
-/// @details Defines the configuration of the GEMM operation performed by each
-/// warp using hardware MFMA (Matrix Fused Multiply-Add) instructions.
-struct WarpGemmParams
-{
-    int gemm_m; ///< The M dimension of a single MFMA instruction (MPerXdl).
-    int gemm_n; ///< The N dimension of a single MFMA instruction (NPerXdl).
-    int m_iter; ///< The number of MFMA iterations along the M dimension of the output tile per
-                ///< wavefront (MXdlPerWave).
-    int n_iter; ///< The number of MFMA iterations along the N dimension of the output tile per
-                ///< wavefront (NXdlPerWave).
-};
-
-/// @brief Parameters for shuffling data between warps (CShuffle optimization).
-/// @details Configures how many MFMA instruction results are processed per
-/// wave in each iteration of the CShuffle routine.
-struct WarpShuffleParams
-{
-    int m_gemms_per_shuffle; ///< Number of MFMA results along the M dimension to process per wave
-                             ///< per shuffle iteration.
-    int n_gemms_per_shuffle; ///< Number of MFMA results along the N dimension to process per wave
-                             ///< per shuffle iteration.
-};
-
-/// @brief Information for the output tile transfer (CShuffle).
-/// @details Describes how the final computed tile (C matrix) is written out from
-/// LDS to global memory, including shuffling, thread clustering, and vectorization.
-struct OutputTileTransferInfo
-{
-    WarpShuffleParams shuffle_params; ///< Configuration for cross-warp data shuffling.
-    // m_block, m_wave_per_xdl, n_block, n_wave_per_xdl
-    std::array<int, 4> thread_cluster_dims; ///< The spatial thread distribution used for storing
-                                            ///< data into the output tensor.
-    int scalar_per_vector; ///< The size of the vectorized memory access when storing data to the
-                           ///< output tensor.
-};
-
 // Helper metafunctions to derive signature information from Instance types
 
+/// @brief Helper function to report unsupported convolution direction with a clear error message.
+template <typename Instance>
+consteval void report_unsupported_conv_direction_error()
+{
+    throw "Unsupported convolution direction detected!\n"
+          "The kernel instance does not have a recognized convolution specialization.\n"
+          "Expected one of: kConvForwardSpecialization, kConvBwdDataSpecialization, or "
+          "kConvBwdWeightSpecialization.\n"
+          "Please verify that your kernel instance is properly configured.";
+}
+
 /// @brief Derives the convolution direction from a device kernel `Instance` type.
 /// @tparam Instance The device kernel instance type.
 /// @return A `builder::ConvDirection` enum value (FORWARD, BACKWARD_DATA, or BACKWARD_WEIGHT).
@@ -273,7 +189,10 @@ constexpr builder::ConvDirection conv_direction()
     else if constexpr(requires { &InstTraits::kConvBwdWeightSpecialization; })
         return builder::ConvDirection::BACKWARD_WEIGHT;
     else
-        return builder::ConvDirection::FORWARD; // Default fallback
+    {
+        report_unsupported_conv_direction_error<Instance>();
+        return builder::ConvDirection::FORWARD; // Unreachable
+    }
 }
 
 /// @brief Derives the convolution-specific specialization from a device kernel `Instance` type.
@@ -296,6 +215,7 @@ constexpr auto conv_spec()
         case Filter1x1Pad0: return FILTER_1X1_PAD0;
         case Filter1x1Stride1Pad0: return FILTER_1X1_STRIDE1_PAD0;
         case Filter3x3: return FILTER_3x3;
+        case OddC: return ODD_C;
         }
     }
     else if constexpr(requires { InstTraits::kConvBwdDataSpecialization; })
@@ -334,6 +254,20 @@ template <typename A,
 inline constexpr bool layouts_are =
     std::is_same_v<A, ExpectedA> && std::is_same_v<B, ExpectedB> && std::is_same_v<E, ExpectedE>;
 
+/// @brief Helper function to report unsupported layout combinations with a clear error message.
+/// @details This consteval function is designed to fail at compile time with a descriptive
+/// error message when an unsupported layout combination is encountered.
+template <typename A, typename B, typename E, int SpatialDim>
+consteval void report_unsupported_layout_error()
+{
+    // This will produce a compile-time error with the exception message
+    throw "Unsupported convolution layout combination detected!\n"
+          "The combination of ALayout, BLayout, and ELayout template parameters\n"
+          "is not recognized for the given spatial dimension.\n"
+          "Please verify that your convolution instance uses a supported layout configuration.\n"
+          "Check the conv_layout() function for the list of supported layout combinations.";
+}
+
 /// @brief Derives the grouped convolution layout from a device kernel `Instance` type.
 /// @tparam Instance The device kernel instance type.
 /// @return An std::array corresponding to the tensor layouts:
@@ -358,6 +292,8 @@ constexpr auto conv_layout()
     case 1:
         if constexpr(layouts_are<A, B, E, ctl::GNWC, ctl::GKXC, ctl::GNWK>)
             return layouts(GNWC, GKXC, GNWK);
+        if constexpr(layouts_are<A, B, E, ctl::G_NW_C, ctl::G_K_X_C, ctl::G_NW_K>)
+            return layouts(GNWC, GKXC, GNWK);
         if constexpr(layouts_are<A, B, E, ctl::NWGC, ctl::GKXC, ctl::NWGK>)
             return layouts(NWGC, GKXC, NWGK);
         if constexpr(layouts_are<A, B, E, ctl::NGCW, ctl::GKXC, ctl::NGKW>)
@@ -368,8 +304,12 @@ constexpr auto conv_layout()
     case 2:
         if constexpr(layouts_are<A, B, E, ctl::GNHWC, ctl::GKYXC, ctl::GNHWK>)
             return layouts(GNHWC, GKYXC, GNHWK);
+        if constexpr(layouts_are<A, B, E, ctl::G_NHW_C, ctl::G_K_YX_C, ctl::G_NHW_K>)
+            return layouts(GNHWC, GKYXC, GNHWK);
         if constexpr(layouts_are<A, B, E, ctl::NHWGC, ctl::GKYXC, ctl::NHWGK>)
             return layouts(NHWGC, GKYXC, NHWGK);
+        if constexpr(layouts_are<A, B, E, ctl::NHWGC, ctl::KYXGC, ctl::NHWGK>)
+            return layouts(NHWGC, GKYXC, NHWGK);
         if constexpr(layouts_are<A, B, E, ctl::NGCHW, ctl::GKYXC, ctl::NGKHW>)
             return layouts(NGCHW, GKYXC, NGKHW);
         if constexpr(layouts_are<A, B, E, ctl::NGCHW, ctl::GKCYX, ctl::NGKHW>)
@@ -378,6 +318,8 @@ constexpr auto conv_layout()
     case 3:
         if constexpr(layouts_are<A, B, E, ctl::GNDHWC, ctl::GKZYXC, ctl::GNDHWK>)
             return layouts(GNDHWC, GKZYXC, GNDHWK);
+        if constexpr(layouts_are<A, B, E, ctl::G_NDHW_C, ctl::G_K_ZYX_C, ctl::G_NDHW_K>)
+            return layouts(GNDHWC, GKZYXC, GNDHWK);
         if constexpr(layouts_are<A, B, E, ctl::NDHWGC, ctl::GKZYXC, ctl::NDHWGK>)
             return layouts(NDHWGC, GKZYXC, NDHWGK);
         if constexpr(layouts_are<A, B, E, ctl::NGCDHW, ctl::GKZYXC, ctl::NGKDHW>)
@@ -386,11 +328,31 @@ constexpr auto conv_layout()
             return layouts(NGCDHW, GKCZYX, NGKDHW);
         break;
     }
+
+    // If we reach here, the layout combination is not supported
+    // Call consteval function to trigger a compile-time error with a clear message
+    report_unsupported_layout_error<A, B, E, InstanceTraits<Instance>::kSpatialDim>();
+
+    // This return is unreachable but needed to satisfy the compiler
+    return layouts(GNHWC, GKYXC, GNHWK);
+}
+
+/// @brief Helper function to report unsupported data type with a clear error message.
+template <typename ADataType>
+consteval void report_unsupported_data_type_error()
+{
+    throw "Unsupported data type detected!\n"
+          "The ADataType is not recognized.\n"
+          "Supported types are: ck::half_t (FP16), ck::Tuple<ck::half_t, ck::half_t> (FP16_FP16), "
+          "ck::bhalf_t (BF16), ck::Tuple<ck::bhalf_t, ck::bhalf_t> (BF16_BF16), float (FP32), "
+          "ck::Tuple<float, float> (FP32_FP32), double (FP64), ck::f8_t (FP8), ck::bf8_fnuz_t "
+          "(BF8), ck::bf8_ocp_t (BF8), "
+          "int8_t (I8), ck::Tuple<int8_t, int8_t> (I8_I8), uint8_t (U8).\n"
+          "Please verify that your kernel instance uses a supported data type.";
 }
 
 /// @brief Derives the data type from a device kernel `Instance` type.
-/// @tparam Instance The device kernel instance type.
-/// @return A `builder::DataType` enum value (e.g., FP16, BF16, FP32).
+/// @return A `builder::DataType` enum value (e.g., FP16, BF16, FP32, BF8).
 template <typename Instance>
 constexpr builder::DataType conv_data_type()
     requires HasDataTypes<InstanceTraits<Instance>>
@@ -401,18 +363,50 @@ constexpr builder::DataType conv_data_type()
 
     if constexpr(std::is_same_v<ADataType, ck::half_t>)
         return FP16;
+    else if constexpr(std::is_same_v<ADataType, ck::Tuple<ck::half_t, ck::half_t>>)
+        return FP16_FP16;
     else if constexpr(std::is_same_v<ADataType, ck::bhalf_t>)
         return BF16;
+    else if constexpr(std::is_same_v<ADataType, ck::Tuple<ck::bhalf_t, ck::bhalf_t>>)
+        return BF16_BF16;
     else if constexpr(std::is_same_v<ADataType, float>)
         return FP32;
+    else if constexpr(std::is_same_v<ADataType, ck::Tuple<float, float>>)
+        return FP32_FP32;
+    else if constexpr(std::is_same_v<ADataType, double>)
+        return FP64;
     else if constexpr(std::is_same_v<ADataType, ck::f8_t>)
         return FP8;
+    else if constexpr(std::is_same_v<ADataType, ck::bf8_fnuz_t>)
+        return BF8;
+    else if constexpr(std::is_same_v<ADataType, ck::bf8_ocp_t>)
+        return BF8;
     else if constexpr(std::is_same_v<ADataType, int8_t>)
         return I8;
+    else if constexpr(std::is_same_v<ADataType, ck::Tuple<int8_t, int8_t>>)
+        return I8_I8;
     else if constexpr(std::is_same_v<ADataType, uint8_t>)
         return U8;
     else
-        return FP32; // Default fallback
+    {
+        report_unsupported_data_type_error<ADataType>();
+        return FP32; // Unreachable
+    }
+}
+
+/// @brief Helper function to report unsupported elementwise operation with a clear error message.
+template <typename ElementwiseOp>
+consteval void report_unsupported_elementwise_op_error()
+{
+    throw "Unsupported elementwise operation detected!\n"
+          "The elementwise operation type is not recognized.\n"
+          "Supported operations include: AddClamp, AddReluAdd, BiasBnormClamp, Bilinear, "
+          "BiasNormalizeInInferClamp, Clamp, ConvInvscale, ConvScale, ConvScaleAdd, "
+          "ConvScaleRelu, Scale, ScaleAdd, PassThrough, ScaleAddScaleAddRelu, DynamicUnaryOp, "
+          "UnaryCombinedOp, Activation_Mul2_Clamp, Activation_Mul_Clamp, Add_Activation_Mul_Clamp, "
+          "Add_Activation_Mul2_Clamp, Add_Mul_Activation_Mul_Clamp, Add_Mul2_Activation_Mul_Clamp, "
+          "UnaryConvert (see elementwise_op() for the complete list).\n"
+          "Please verify that your kernel instance uses a supported elementwise operation.";
 }
 
 /// @brief Derives the elementwise operation from op type.
@@ -424,16 +418,83 @@ constexpr builder::ElementwiseOperation elementwise_op()
     using enum builder::ElementwiseOperation;
     constexpr std::string_view name = detail::elementwise_op_name<ElementwiseOp>();
 
-    if constexpr(detail::case_insensitive_equal(name, "BiasBnormClamp"))
+    if constexpr(detail::case_insensitive_equal(name, "AddClamp"))
+        return ADD_CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "AddReluAdd"))
+        return ADD_RELU_ADD;
+    else if constexpr(detail::case_insensitive_equal(name, "BiasBnormClamp"))
         return BIAS_BNORM_CLAMP;
-    if constexpr(detail::case_insensitive_equal(name, "Clamp"))
+    else if constexpr(detail::case_insensitive_equal(name, "Bilinear"))
+        return BILINEAR;
+    else if constexpr(detail::case_insensitive_equal(name, "BiasNormalizeInInferClamp"))
+        return BIAS_BNORM_CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "Clamp"))
         return CLAMP;
-    if constexpr(detail::case_insensitive_equal(name, "Scale"))
+    else if constexpr(detail::case_insensitive_equal(name, "ConvInvscale"))
+        return CONV_INVSCALE;
+    else if constexpr(detail::case_insensitive_equal(name, "ConvScale"))
+        return CONV_SCALE;
+    else if constexpr(detail::case_insensitive_equal(name, "ConvScaleAdd"))
+        return CONV_SCALE_ADD;
+    else if constexpr(detail::case_insensitive_equal(name, "ConvScaleRelu"))
+        return CONV_SCALE_RELU;
+    else if constexpr(detail::case_insensitive_equal(name, "Scale"))
         return SCALE;
-    if constexpr(detail::case_insensitive_equal(name, "PassThrough"))
+    else if constexpr(detail::case_insensitive_equal(name, "ScaleAdd"))
+        return SCALE_ADD;
+    else if constexpr(detail::case_insensitive_equal(name, "PassThrough"))
         return PASS_THROUGH;
-    if constexpr(detail::case_insensitive_equal(name, "ScaleAddScaleAddRelu"))
+    else if constexpr(detail::case_insensitive_equal(name, "ScaleAddScaleAddRelu"))
         return SCALEADD_SCALEADD_RELU;
+    else if constexpr(detail::case_insensitive_equal(name, "DynamicUnaryOp"))
+        return DYNAMIC_UNARY_OP;
+    else if constexpr(detail::case_insensitive_equal(name, "UnaryCombinedOp"))
+        return UNARY_COMBINED_OP;
+    else if constexpr(detail::case_insensitive_equal(name, "Activation_Mul2_Clamp"))
+        return ACTIVATION_MUL2_CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "Activation_Mul_Clamp"))
+        return ACTIVATION_MUL_CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "Add_Activation_Mul_Clamp"))
+        return ADD_ACTIVATION_MUL_CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "Add_Activation_Mul2_Clamp"))
+        return ADD_ACTIVATION_MUL2_CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "Add_Mul_Activation_Mul_Clamp"))
+        return ADD_MUL_ACTIVATION_MUL_CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "Add_Mul2_Activation_Mul_Clamp"))
+        return ADD_MUL2_ACTIVATION_MUL_CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "UnaryConvert"))
+        return UNARY_CONVERT;
+    else if constexpr(detail::case_insensitive_equal(name, "Logistic"))
+        return LOGISTIC;
+    else if constexpr(detail::case_insensitive_equal(name, "ClippedRelu"))
+        return CLIPPED_RELU;
+    else if constexpr(detail::case_insensitive_equal(name, "Swish"))
+        return SWISH;
+    else if constexpr(detail::case_insensitive_equal(name, "Elu"))
+        return ELU;
+    else if constexpr(detail::case_insensitive_equal(name, "Power"))
+        return POWER;
+    else if constexpr(detail::case_insensitive_equal(name, "LeakyRelu"))
+        return LEAKY_RELU;
+    else if constexpr(detail::case_insensitive_equal(name, "UnaryAbs"))
+        return UNARY_ABS;
+    else if constexpr(detail::case_insensitive_equal(name, "Relu"))
+        return RELU;
+    else if constexpr(detail::case_insensitive_equal(name, "SoftRelu"))
+        return SOFT_RELU;
+    else if constexpr(detail::case_insensitive_equal(name, "Sigmoid"))
+        return SIGMOID;
+    else if constexpr(detail::case_insensitive_equal(name, "TanH"))
+        return TANH;
+    else if constexpr(detail::case_insensitive_equal(name, "Gelu"))
+        return GELU;
+    else if constexpr(detail::case_insensitive_equal(name, "Silu"))
+        return SILU;
+    else
+    {
+        report_unsupported_elementwise_op_error<ElementwiseOp>();
+        return PASS_THROUGH; // Unreachable
+    }
 }
 
 /// @brief Derives a gemm padding from a kernel instance type.
@@ -606,45 +667,4 @@ struct ConvTraits<Instance>
     static constexpr auto pipeline_scheduler = get_pipeline_scheduler();
 };
 
-/// @brief Specialization of `ConvTraits` for a `ConvBuilder` type.
-/// @details This specialization provides backward compatibility for reflecting
-/// on kernels defined via the `ConvBuilder` interface. It works by first
-/// creating the `Instance` via the builder, and then delegating
-/// all trait extraction to the `ConvTraits<Instance>` specialization.
-template <builder::ConvSignatureDescriptor auto SIGNATURE,
-          builder::ConvAlgorithmDescriptor auto ALGORITHM,
-          builder::StringLiteral VERSION>
-struct ConvTraits<builder::ConvBuilder<SIGNATURE, ALGORITHM, VERSION>>
-{
-    using Instance = typename builder::ConvBuilder<SIGNATURE, ALGORITHM, VERSION>::Instance;
-
-    // Delegate to Instance-based ConvTraits
-    using InstanceConvTraits = ConvTraits<Instance>;
-
-    // Forward all members from Instance-based traits
-    static constexpr int spatial_dim                  = InstanceConvTraits::spatial_dim;
-    static constexpr builder::ConvDirection direction = InstanceConvTraits::direction;
-    static constexpr auto layout                      = InstanceConvTraits::layout;
-    static constexpr builder::DataType data_type      = InstanceConvTraits::data_type;
-
-    static constexpr builder::ElementwiseOperation input_element_op =
-        InstanceConvTraits::input_element_op;
-    static constexpr builder::ElementwiseOperation weight_element_op =
-        InstanceConvTraits::weight_element_op;
-    static constexpr builder::ElementwiseOperation output_element_op =
-        InstanceConvTraits::output_element_op;
-
-    static constexpr auto gemm_padding        = InstanceConvTraits::gemm_padding;
-    static constexpr auto conv_specialization = InstanceConvTraits::conv_specialization;
-
-    static constexpr int thread_block_size                  = InstanceConvTraits::thread_block_size;
-    static constexpr DataTileInfo tile_dims                 = InstanceConvTraits::tile_dims;
-    static constexpr InputTileTransferInfo a_tile_transfer  = InstanceConvTraits::a_tile_transfer;
-    static constexpr InputTileTransferInfo b_tile_transfer  = InstanceConvTraits::b_tile_transfer;
-    static constexpr WarpGemmParams warp_gemm               = InstanceConvTraits::warp_gemm;
-    static constexpr OutputTileTransferInfo c_tile_transfer = InstanceConvTraits::c_tile_transfer;
-    static constexpr auto pipeline_version                  = InstanceConvTraits::pipeline_version;
-    static constexpr auto pipeline_scheduler = InstanceConvTraits::pipeline_scheduler;
-};
-
 } // namespace ck_tile::reflect::conv
diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_types.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_types.hpp
new file mode 100644
index 0000000000..bb98455617
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/reflect/conv_types.hpp
@@ -0,0 +1,109 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/// @file
+/// @brief Type definitions for convolution reflection
+///
+/// This file contains the type definitions used by both conv_traits.hpp and conv_description.hpp
+/// to avoid circular dependencies.
+
+#pragma once
+
+#include <array>
+
+namespace ck_tile::reflect::conv {
+
+/// @brief Data tile dimensions processed by a workgroup.
+/// @details This struct defines the M, N, and K dimensions of the data tile
+/// that a single workgroup (thread block) is responsible for processing in the
+/// underlying GEMM computation.
+struct DataTileInfo
+{
+    int m; ///< M dimension of the tile processed by the workgroup (MPerBlock).
+    int n; ///< N dimension of the tile processed by the workgroup (NPerBlock).
+    int k; ///< K dimension of the tile processed by the workgroup (KPerBlock).
+};
+
+/// @brief Dimensions for an input data tile transfer.
+/// @details Defines the shape of the input tile (A or B matrix) as it is
+/// transferred from global memory to LDS. The tile is conceptually divided
+/// into k0 and k1 dimensions.
+struct InputTileTransferDimensions
+{
+    int k0;     ///< The outer dimension of K, where K = k0 * k1.
+    int m_or_n; ///< The M dimension for the A matrix transfer, or the N dimension for the B matrix.
+    int k1; ///< The inner dimension of K, often corresponding to the vector load size from global
+            ///< memory.
+};
+
+/// @brief Parameters governing the transfer of an input tile.
+/// @details This struct holds configuration details for how an input tile is
+/// loaded from global memory into LDS, including thread clustering, memory
+/// access patterns, and vectorization settings.
+struct InputTileTransferParams
+{
+    int k1; ///< The inner K dimension size, often matching the vectorization width.
+    std::array<int, 3>
+        thread_cluster_dims; ///< Spatial thread distribution over the input data tile; defines how
+                             ///< many threads are arranged on each axis.
+    std::array<int, 3> thread_cluster_order; ///< The order of thread spatial distribution over the
+                                             ///< input tensor dimensions.
+    std::array<int, 3> src_access_order; ///< The order of accessing input tensor axes (e.g., which
+                                         ///< dimension to read first).
+    int src_vector_dim; ///< The index of the axis on which vectorized memory access is performed
+                        ///< (the contiguous dimension).
+    int src_scalar_per_vector;    ///< The size of the vector access instruction; the number of
+                                  ///< elements accessed per thread per instruction.
+    int dst_scalar_per_vector_k1; ///< The size of the vectorized store into LDS memory along the K1
+                                  ///< dimension.
+    bool lds_padding; ///< Flag indicating if padding is used for the LDS tensor to prevent bank
+                      ///< conflicts.
+};
+
+/// @brief Complete information for an input tile transfer.
+/// @details Combines the dimensional information and transfer parameters for
+/// a full description of an input tile's journey from global memory to LDS.
+struct InputTileTransferInfo
+{
+    InputTileTransferDimensions tile_dimensions; ///< The shape and layout of the tile.
+    InputTileTransferParams transfer_params; ///< The parameters for the memory transfer operation.
+};
+
+/// @brief Parameters for the warp-level GEMM computation.
+/// @details Defines the configuration of the GEMM operation performed by each
+/// warp using hardware MFMA (Matrix Fused Multiply-Add) instructions.
+struct WarpGemmParams
+{
+    int gemm_m; ///< The M dimension of a single MFMA instruction (MPerXdl).
+    int gemm_n; ///< The N dimension of a single MFMA instruction (NPerXdl).
+    int m_iter; ///< The number of MFMA iterations along the M dimension of the output tile per
+                ///< wavefront (MXdlPerWave).
+    int n_iter; ///< The number of MFMA iterations along the N dimension of the output tile per
+                ///< wavefront (NXdlPerWave).
+};
+
+/// @brief Parameters for shuffling data between warps (CShuffle optimization).
+/// @details Configures how many MFMA instruction results are processed per
+/// wave in each iteration of the CShuffle routine.
+struct WarpShuffleParams
+{
+    int m_gemms_per_shuffle; ///< Number of MFMA results along the M dimension to process per wave
+                             ///< per shuffle iteration.
+    int n_gemms_per_shuffle; ///< Number of MFMA results along the N dimension to process per wave
+                             ///< per shuffle iteration.
+};
+
+/// @brief Information for the output tile transfer (CShuffle).
+/// @details Describes how the final computed tile (C matrix) is written out from
+/// LDS to global memory, including shuffling, thread clustering, and vectorization.
+struct OutputTileTransferInfo
+{
+    WarpShuffleParams shuffle_params; ///< Configuration for cross-warp data shuffling.
+    // thread_cluster_dims order (TODO confirm): m_block, m_wave_per_xdl, n_block, n_wave_per_xdl
+    std::array<int, 4> thread_cluster_dims; ///< The spatial thread distribution used for storing
+                                            ///< data into the output tensor.
+    int scalar_per_vector; ///< The size of the vectorized memory access when storing data to the
+                           ///< output tensor.
+};
+
+} // namespace ck_tile::reflect::conv
diff --git a/experimental/builder/include/ck_tile/builder/reflect/description.hpp b/experimental/builder/include/ck_tile/builder/reflect/description.hpp
index c3a38964a7..6a7b2513be 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/description.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/description.hpp
@@ -20,6 +20,11 @@ namespace ck_tile::reflect {
 class Description
 {
     public:
+    Description()                              = default;
+    Description(const Description&)            = default;
+    Description(Description&&)                 = default;
+    Description& operator=(const Description&) = default;
+    Description& operator=(Description&&)      = default;
     /// @brief Virtual destructor for proper cleanup of derived classes
     virtual ~Description() = default;
 
@@ -36,4 +41,30 @@ class Description
     virtual std::string instance_string() const = 0;
 };
 
+/// @brief A Description that only provides a meaningful instance_string().
+/// @details Helper for kernels that don't yet have full ConvDescription support:
+/// brief() and detailed() both return the literal placeholder "not supported".
+class InstanceStringDescription : public Description
+{
+    public:
+    /// @brief Constructs the description from an instance string.
+    /// @param instance The instance string to store; taken by value and moved into the member.
+    explicit InstanceStringDescription(std::string instance) : instance_(std::move(instance)) {}
+
+    /// @brief Brief descriptions are not implemented for this helper.
+    /// @return The literal placeholder string "not supported".
+    std::string brief() const override { return "not supported"; }
+
+    /// @brief Detailed descriptions are not implemented for this helper.
+    /// @return The literal placeholder string "not supported".
+    std::string detailed() const override { return "not supported"; }
+
+    /// @brief Returns the stored instance string.
+    /// @return A copy of the instance string provided during construction.
+    std::string instance_string() const override { return instance_; }
+
+    private:
+    std::string instance_; ///< Instance string captured at construction.
+};
+
 } // namespace ck_tile::reflect
diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp
index fb732df55e..f7386720b3 100644
--- a/experimental/builder/include/ck_tile/builder/types.hpp
+++ b/experimental/builder/include/ck_tile/builder/types.hpp
@@ -11,15 +11,22 @@
 
 namespace ck_tile::builder {
 
+// TODO: Handle tuple types and FP8/BF8 properly
 enum class DataType
 {
     UNDEFINED_DATA_TYPE = 0,
     FP32,
+    FP32_FP32,
     FP16,
+    FP16_FP16,
     BF16,
+    BF16_BF16,
     FP8,
+    BF8,
+    FP64,
     INT32,
     I8,
+    I8_I8,
     U8
 };
 
@@ -102,13 +109,44 @@ enum class ConvDirection
 };
 
 // Fused element-wise operations.
+// TODO: Generalize design rather than enumerating all possible ops.
 enum class ElementwiseOperation
 {
+    ADD_CLAMP,
+    ADD_RELU_ADD,
+    ACTIVATION_MUL2_CLAMP,
+    ACTIVATION_MUL_CLAMP,
+    ADD_ACTIVATION_MUL_CLAMP,
+    ADD_ACTIVATION_MUL2_CLAMP,
+    ADD_MUL_ACTIVATION_MUL_CLAMP,
+    ADD_MUL2_ACTIVATION_MUL_CLAMP,
     BIAS_BNORM_CLAMP,
+    BILINEAR,
     SCALE,
+    SCALE_ADD,
     CLAMP,
+    CONV_INVSCALE,
+    CONV_SCALE,
+    CONV_SCALE_ADD,
+    CONV_SCALE_RELU,
     PASS_THROUGH,
-    SCALEADD_SCALEADD_RELU
+    SCALEADD_SCALEADD_RELU,
+    DYNAMIC_UNARY_OP,
+    UNARY_COMBINED_OP,
+    UNARY_CONVERT,
+    LOGISTIC,
+    CLIPPED_RELU,
+    SWISH,
+    ELU,
+    POWER,
+    LEAKY_RELU,
+    UNARY_ABS,
+    RELU,
+    SOFT_RELU,
+    SIGMOID,
+    TANH,
+    GELU,
+    SILU
 };
 
 // Enums for pipeline versions & schedulers
@@ -160,7 +198,8 @@ enum class ConvFwdSpecialization
     DEFAULT,
     FILTER_1X1_PAD0,
     FILTER_1X1_STRIDE1_PAD0,
-    FILTER_3x3
+    FILTER_3x3,
+    ODD_C
 };
 
 // Enums for the backward data convolution specialization.
@@ -219,11 +258,17 @@ inline std::string_view toString(DataType dt)
     switch(dt)
     {
     case FP16: return "FP16";
+    case FP16_FP16: return "FP16_FP16";
     case FP32: return "FP32";
+    case FP32_FP32: return "FP32_FP32";
     case BF16: return "BF16";
+    case BF16_BF16: return "BF16_BF16";
     case FP8: return "FP8";
+    case BF8: return "BF8";
+    case FP64: return "FP64";
     case INT32: return "INT32";
     case I8: return "I8";
+    case I8_I8: return "I8_I8";
     case U8: return "U8";
     case UNDEFINED_DATA_TYPE: return "UNDEFINED_DATA_TYPE";
     default: return "Unknown";
@@ -247,11 +292,41 @@ inline std::string_view toString(ElementwiseOperation op)
     using enum ElementwiseOperation;
     switch(op)
     {
+    case ADD_CLAMP: return "ADD_CLAMP";
+    case ADD_RELU_ADD: return "ADD_RELU_ADD";
+    case ACTIVATION_MUL2_CLAMP: return "ACTIVATION_MUL2_CLAMP";
+    case ACTIVATION_MUL_CLAMP: return "ACTIVATION_MUL_CLAMP";
+    case ADD_ACTIVATION_MUL_CLAMP: return "ADD_ACTIVATION_MUL_CLAMP";
+    case ADD_ACTIVATION_MUL2_CLAMP: return "ADD_ACTIVATION_MUL2_CLAMP";
+    case ADD_MUL_ACTIVATION_MUL_CLAMP: return "ADD_MUL_ACTIVATION_MUL_CLAMP";
+    case ADD_MUL2_ACTIVATION_MUL_CLAMP: return "ADD_MUL2_ACTIVATION_MUL_CLAMP";
+    case BIAS_BNORM_CLAMP: return "BIAS_BNORM_CLAMP";
+    case BILINEAR: return "BILINEAR";
     case CLAMP: return "CLAMP";
     case SCALE: return "SCALE";
+    case SCALE_ADD: return "SCALE_ADD";
+    case CONV_INVSCALE: return "CONV_INVSCALE";
+    case CONV_SCALE: return "CONV_SCALE";
+    case CONV_SCALE_ADD: return "CONV_SCALE_ADD";
+    case CONV_SCALE_RELU: return "CONV_SCALE_RELU";
     case PASS_THROUGH: return "PASS_THROUGH";
-    case BIAS_BNORM_CLAMP: return "BIAS_BNORM_CLAMP";
     case SCALEADD_SCALEADD_RELU: return "SCALEADD_SCALEADD_RELU";
+    case DYNAMIC_UNARY_OP: return "DYNAMIC_UNARY_OP";
+    case UNARY_COMBINED_OP: return "UNARY_COMBINED_OP";
+    case UNARY_CONVERT: return "UNARY_CONVERT";
+    case LOGISTIC: return "LOGISTIC";
+    case CLIPPED_RELU: return "CLIPPED_RELU";
+    case SWISH: return "SWISH";
+    case ELU: return "ELU";
+    case POWER: return "POWER";
+    case LEAKY_RELU: return "LEAKY_RELU";
+    case UNARY_ABS: return "UNARY_ABS";
+    case RELU: return "RELU";
+    case SOFT_RELU: return "SOFT_RELU";
+    case SIGMOID: return "SIGMOID";
+    case TANH: return "TANH";
+    case GELU: return "GELU";
+    case SILU: return "SILU";
     default: return "Unknown";
     }
 }
@@ -305,6 +380,7 @@ inline std::string_view toString(ConvFwdSpecialization spec)
     case FILTER_1X1_PAD0: return "FILTER_1X1_PAD0";
     case FILTER_1X1_STRIDE1_PAD0: return "FILTER_1X1_STRIDE1_PAD0";
     case FILTER_3x3: return "FILTER_3x3";
+    case ODD_C: return "ODD_C";
     default: return "Unknown";
     }
 }
diff --git a/experimental/builder/test/conv/ck/test_conv_traits.cpp b/experimental/builder/test/conv/ck/test_conv_traits.cpp
index d052aba548..d5661ad67b 100644
--- a/experimental/builder/test/conv/ck/test_conv_traits.cpp
+++ b/experimental/builder/test/conv/ck/test_conv_traits.cpp
@@ -5,6 +5,7 @@
 #include <gmock/gmock.h>
 #include <concepts>
 
+#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include <ck_tile/builder/reflect/conv_traits.hpp>
 #include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp>
 #include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp>
diff --git a/experimental/builder/test/test_conv_description.cpp b/experimental/builder/test/test_conv_description.cpp
index ace9ce0239..158cb2668f 100644
--- a/experimental/builder/test/test_conv_description.cpp
+++ b/experimental/builder/test/test_conv_description.cpp
@@ -4,8 +4,9 @@
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>
 
-#include <ck_tile/builder/conv_builder.hpp>
-#include <ck_tile/builder/reflect/conv_description.hpp>
+#include "ck_tile/builder/conv_builder.hpp"
+#include "ck_tile/builder/reflect/conv_description.hpp"
+#include "ck_tile/builder/reflect/conv_describe.hpp"
 #include "testing_utils.hpp"
 #include "impl/conv_signature_types.hpp"
 #include "impl/conv_algorithm_types.hpp"
diff --git a/experimental/builder/test/test_instance_string_bwd_weight_grp_conv_xdl.cpp b/experimental/builder/test/test_instance_string_bwd_weight_grp_conv_xdl.cpp
index 88a57a3735..38e79a2eb5 100644
--- a/experimental/builder/test/test_instance_string_bwd_weight_grp_conv_xdl.cpp
+++ b/experimental/builder/test/test_instance_string_bwd_weight_grp_conv_xdl.cpp
@@ -72,14 +72,16 @@ std::string expected_str = "DeviceGroupedConvBwdWeight_Xdl_CShuffle"
                            ",1"             // MaxTransposeTransferSrcScalarPerVector
                            ",1>";           // MaxTransposeTransferDstScalarPerVector
 
-// Test GetInstanceString through base class pointer for backward weight XDL variant
-TEST(InstanceString, GetInstanceStringReturnsCorrectValueForBwdWeightGrpConvXdl)
+// Test describe() through base class pointer for backward weight XDL variant
+TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvXdl)
 {
     using BaseClass = ck::tensor_operation::device::BaseOperator;
     DeviceInstance device_instance;
     BaseClass* base_ptr = &device_instance;
 
-    EXPECT_EQ(base_ptr->GetInstanceString(), expected_str);
+    auto desc = base_ptr->describe();
+    ASSERT_NE(desc, nullptr);
+    EXPECT_EQ(desc->instance_string(), expected_str);
 }
 
 // TODO: Add DescriptionReturnsCorrectValueForBwdWeightGrpConvXdl test once ckr::describe supports
diff --git a/experimental/builder/test/test_instance_string_fwd_grp_conv.cpp b/experimental/builder/test/test_instance_string_fwd_grp_conv.cpp
index 35f3db1469..bb67e18087 100644
--- a/experimental/builder/test/test_instance_string_fwd_grp_conv.cpp
+++ b/experimental/builder/test/test_instance_string_fwd_grp_conv.cpp
@@ -2,10 +2,11 @@
 // SPDX-License-Identifier: MIT
 
 #include "gtest/gtest.h"
-#include "ck_tile/builder/reflect/instance_traits.hpp"
-#include "ck_tile/builder/reflect/conv_description.hpp"
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include <ck_tile/builder/reflect/conv_describe.hpp>
+#include <ck_tile/builder/reflect/conv_description.hpp>
+#include <ck_tile/builder/reflect/instance_traits.hpp>
+#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp>
+#include <ck/tensor_operation/gpu/device/device_base.hpp>
 
 namespace {
 
@@ -77,14 +78,16 @@ std::string expected_str = "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle"
                            ",Default"       // LoopScheduler
                            ",1>";           // NumGroupsToMerge
 
-// Test GetInstanceString through base class pointer for standard XDL variant
-TEST(InstanceString, GetInstanceStringReturnsCorrectValueForFwdGrpConv)
+// Test describe() through base class pointer for standard XDL variant
+TEST(InstanceString, DescribeReturnsCorrectValueForFwdGrpConv)
 {
     using BaseClass = ck::tensor_operation::device::BaseOperator;
     DeviceInstance device_instance;
     BaseClass* base_ptr = &device_instance;
 
-    EXPECT_EQ(base_ptr->GetInstanceString(), expected_str);
+    auto desc = base_ptr->describe();
+    ASSERT_NE(desc, nullptr);
+    EXPECT_EQ(desc->instance_string(), expected_str);
 }
 
 TEST(InstanceString, DescriptionReturnsCorrectValueForFwdGrpConv)
diff --git a/experimental/builder/test/test_instance_string_fwd_grp_conv_dl.cpp b/experimental/builder/test/test_instance_string_fwd_grp_conv_dl.cpp
index 4f018cca11..cc585342c6 100644
--- a/experimental/builder/test/test_instance_string_fwd_grp_conv_dl.cpp
+++ b/experimental/builder/test/test_instance_string_fwd_grp_conv_dl.cpp
@@ -71,14 +71,16 @@ std::string expected_str = "DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK"
                            ",5"                // CThreadTransferSrcDstVectorDim
                            ",1>";              // CThreadTransferDstScalarPerVector
 
-// Test GetInstanceString through base class pointer for DL variant
-TEST(InstanceString, GetInstanceStringReturnsCorrectValueForFwdGrpConvDl)
+// Test describe() through base class pointer for DL variant
+TEST(InstanceString, DescribeReturnsCorrectValueForFwdGrpConvDl)
 {
     using BaseClass = ck::tensor_operation::device::BaseOperator;
     DeviceInstance device_instance;
     BaseClass* base_ptr = &device_instance;
 
-    EXPECT_EQ(base_ptr->GetInstanceString(), expected_str);
+    auto desc = base_ptr->describe();
+    ASSERT_NE(desc, nullptr);
+    EXPECT_EQ(desc->instance_string(), expected_str);
 }
 
 // TODO: Add DescriptionReturnsCorrectValueForFwdGrpConvDl test once ckr::describe supports DL
diff --git a/experimental/builder/test/test_instance_string_fwd_grp_conv_large_tensor.cpp b/experimental/builder/test/test_instance_string_fwd_grp_conv_large_tensor.cpp
index 26b50bea6d..2f4e15e35c 100644
--- a/experimental/builder/test/test_instance_string_fwd_grp_conv_large_tensor.cpp
+++ b/experimental/builder/test/test_instance_string_fwd_grp_conv_large_tensor.cpp
@@ -2,10 +2,11 @@
 // SPDX-License-Identifier: MIT
 
 #include <gtest/gtest.h>
-#include <ck_tile/builder/reflect/instance_traits.hpp>
+#include <ck_tile/builder/reflect/conv_describe.hpp>
 #include <ck_tile/builder/reflect/conv_description.hpp>
-#include <ck/tensor_operation/gpu/device/device_base.hpp>
+#include <ck_tile/builder/reflect/instance_traits.hpp>
 #include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp>
+#include <ck/tensor_operation/gpu/device/device_base.hpp>
 
 namespace {
 
@@ -76,14 +77,16 @@ std::string expected_str = "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Ten
                            ",fp16"          // BComputeDataType
                            ",Default>";     // LoopScheduler
 
-// Test GetInstanceString through base class pointer for large tensor variant
-TEST(InstanceString, GetInstanceStringReturnsCorrectValueForFwdGrpConvLargeTensor)
+// Test describe() through base class pointer for large tensor variant
+TEST(InstanceString, DescribeReturnsCorrectValueForFwdGrpConvLargeTensor)
 {
     using BaseClass = ck::tensor_operation::device::BaseOperator;
     DeviceInstance device_instance;
     BaseClass* base_ptr = &device_instance;
 
-    EXPECT_EQ(base_ptr->GetInstanceString(), expected_str);
+    auto desc = base_ptr->describe();
+    ASSERT_NE(desc, nullptr);
+    EXPECT_EQ(desc->instance_string(), expected_str);
 }
 
 TEST(InstanceString, DescriptionReturnsCorrectValueForFwdGrpConvLargeTensor)
diff --git a/experimental/builder/test/test_instance_string_fwd_grp_conv_v3.cpp b/experimental/builder/test/test_instance_string_fwd_grp_conv_v3.cpp
index 604667dd10..ccfa4c7197 100644
--- a/experimental/builder/test/test_instance_string_fwd_grp_conv_v3.cpp
+++ b/experimental/builder/test/test_instance_string_fwd_grp_conv_v3.cpp
@@ -3,6 +3,7 @@
 
 #include <gtest/gtest.h>
 #include <ck_tile/builder/reflect/instance_traits.hpp>
+#include <ck_tile/builder/reflect/conv_describe.hpp>
 #include <ck_tile/builder/reflect/conv_description.hpp>
 #include <ck/tensor_operation/gpu/device/device_base.hpp>
 #include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp>
@@ -78,14 +79,16 @@ std::string expected_str = "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
                            ",fp16"          // BComputeDataType
                            ",false>";       // DirectLoad
 
-// Test GetInstanceString through base class pointer for V3 variant
-TEST(InstanceString, GetInstanceStringReturnsCorrectValueForFwdGrpConvV3)
+// Test describe() through base class pointer for V3 variant
+TEST(InstanceString, DescribeReturnsCorrectValueForFwdGrpConvV3)
 {
     using BaseClass = ck::tensor_operation::device::BaseOperator;
     DeviceInstance device_instance;
     BaseClass* base_ptr = &device_instance;
 
-    EXPECT_EQ(base_ptr->GetInstanceString(), expected_str);
+    auto desc = base_ptr->describe();
+    ASSERT_NE(desc, nullptr);
+    EXPECT_EQ(desc->instance_string(), expected_str);
 }
 
 TEST(InstanceString, DescriptionReturnsCorrectValueForFwdGrpConvV3)
diff --git a/experimental/builder/test/test_instance_string_fwd_grp_conv_wmma.cpp b/experimental/builder/test/test_instance_string_fwd_grp_conv_wmma.cpp
index 717b770c52..1b82a37a1c 100644
--- a/experimental/builder/test/test_instance_string_fwd_grp_conv_wmma.cpp
+++ b/experimental/builder/test/test_instance_string_fwd_grp_conv_wmma.cpp
@@ -76,14 +76,16 @@ std::string expected_str = "DeviceGroupedConvFwdMultipleD_Wmma_CShuffle"
                            ",Default"       // LoopSched
                            ",v1>";          // PipelineVer
 
-// Test GetInstanceString through base class pointer for WMMA variant
-TEST(InstanceString, GetInstanceStringReturnsCorrectValueForFwdGrpConvWmma)
+// Test describe() through base class pointer for WMMA variant
+TEST(InstanceString, DescribeReturnsCorrectValueForFwdGrpConvWmma)
 {
     using BaseClass = ck::tensor_operation::device::BaseOperator;
     DeviceInstance device_instance;
     BaseClass* base_ptr = &device_instance;
 
-    EXPECT_EQ(base_ptr->GetInstanceString(), expected_str);
+    auto desc = base_ptr->describe();
+    ASSERT_NE(desc, nullptr);
+    EXPECT_EQ(desc->instance_string(), expected_str);
 }
 
 // TODO: Add DescriptionReturnsCorrectValueForFwdGrpConvWmma test once ckr::describe supports WMMA
diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
index ec623db6f7..3e37aac86e 100644
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -8,8 +8,13 @@
 #include <sstream>
 #include <regex>
 #include <optional>
+#include <memory>
 
 #include "ck/stream_config.hpp"
+
+#ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/description.hpp"
+#endif
 #endif
 #include "ck/utility/get_id.hpp"
 
@@ -227,6 +232,12 @@ struct BaseOperator
 #if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
     virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
     virtual std::string GetTypeString() const { return ""; }
+
+#ifdef CK_EXPERIMENTAL_BUILDER
+    // Returns a description of this operator, or nullptr if unsupported (the base-class default).
+    virtual std::unique_ptr<ck_tile::reflect::Description> describe() const { return nullptr; }
+#endif
+
     virtual std::string GetInstanceString() const { return ""; }
 
     virtual std::string GetTypeIdName() const { return typeid(*this).name(); }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
index be5c6eba40..42ad21dafe 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -25,6 +25,7 @@
 #include "ck/host_utility/kernel_launch.hpp"
 
 #ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/description.hpp"
 #include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
 #endif
 
@@ -1240,6 +1241,11 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                       "for the given template parameters.");
         return ck_tile::reflect::instance_string<DeviceOp>();
     }
+
+    std::unique_ptr<ck_tile::reflect::Description> describe() const override
+    {
+        return std::make_unique<ck_tile::reflect::InstanceStringDescription>(GetInstanceString());
+    }
 #endif
 
     size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
index 347ea25e62..b5ca71d1fa 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -25,6 +25,7 @@
 #include "ck/host_utility/io.hpp"
 
 #ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/description.hpp"
 #include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #endif
 
@@ -1064,6 +1065,11 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
                       "for the given template parameters.");
         return ck_tile::reflect::instance_string<DeviceOp>();
     }
+
+    std::unique_ptr<ck_tile::reflect::Description> describe() const override
+    {
+        return std::make_unique<ck_tile::reflect::InstanceStringDescription>(GetInstanceString());
+    }
 #endif
 };
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
index a9b0975050..5ed8da8d1b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -29,6 +29,7 @@
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/host_utility/io.hpp"
 #ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/conv_describe.hpp"
 #include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 #endif
 
@@ -2080,6 +2081,21 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                       "for the given template parameters.");
         return ck_tile::reflect::instance_string<DeviceOp>();
     }
+
+    std::unique_ptr<ck_tile::reflect::Description> describe() const override
+    {
+        static_assert(ck_tile::reflect::conv::HasConvTraits<DeviceOp>,
+                      "ConvTraits specialization not found for this device operation. "
+                      "If you modified the template parameters of this class, ensure that "
+                      "the corresponding ConvTraits specialization in "
+                      "ck_tile/builder/reflect/conv_traits.hpp is updated to match, or that "
+                      "InstanceTraits in "
+                      "ck_tile/builder/reflect/"
+                      "instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp "
+                      "provides all required members for ConvTraits to work.");
+        return std::make_unique<ck_tile::reflect::conv::ConvDescription>(
+            ck_tile::reflect::describe<DeviceOp>());
+    }
 #endif
 
     size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
index 380f94426f..e69a9caa9c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
@@ -29,6 +29,7 @@
 #include "ck/host_utility/flush_cache.hpp"
 #include "ck/host_utility/io.hpp"
 #ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/conv_describe.hpp"
 #include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp"
 #endif
 
@@ -2103,6 +2104,12 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
                       "for the given template parameters.");
         return ck_tile::reflect::instance_string<DeviceOp>();
     }
+
+    std::unique_ptr<ck_tile::reflect::Description> describe() const override
+    {
+        return std::make_unique<ck_tile::reflect::conv::ConvDescription>(
+            ck_tile::reflect::describe<DeviceOp>());
+    }
 #endif
 
     size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
index a3391c55e8..32e444fe1f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
@@ -25,6 +25,7 @@
 #include "ck/host_utility/io.hpp"
 
 #ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/description.hpp"
 #include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp"
 #endif
 
@@ -1019,6 +1020,11 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
                       "for the given template parameters.");
         return ck_tile::reflect::instance_string<DeviceOp>();
     }
+
+    std::unique_ptr<ck_tile::reflect::Description> describe() const override
+    {
+        return std::make_unique<ck_tile::reflect::InstanceStringDescription>(GetInstanceString());
+    }
 #endif
 };
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
index b51b78d6b9..b21af2abb0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
@@ -25,6 +25,7 @@
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/host_utility/io.hpp"
 #ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/conv_describe.hpp"
 #include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp"
 #endif
 
@@ -1238,6 +1239,22 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
             "for the given template parameters.");
         return ck_tile::reflect::instance_string<DeviceOp>();
     }
+
+    std::unique_ptr<ck_tile::reflect::Description> describe() const override
+    {
+        static_assert(
+            ck_tile::reflect::conv::HasConvTraits<DeviceOp>,
+            "ConvTraits specialization not found for this device operation. "
+            "If you modified the template parameters of this class, ensure that "
+            "the corresponding ConvTraits specialization in "
+            "ck_tile/builder/reflect/conv_traits.hpp is updated to match, or that "
+            "InstanceTraits in "
+            "ck_tile/builder/reflect/"
+            "instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp "
+            "provides all required members for ConvTraits to work.");
+        return std::make_unique<ck_tile::reflect::conv::ConvDescription>(
+            ck_tile::reflect::describe<DeviceOp>());
+    }
 #endif
 };
 

From 22b945e06ea4b4de188d7ff4ec7ae4bf127be9f9 Mon Sep 17 00:00:00 2001
From: Emily Martins <65371150+ecamartins@users.noreply.github.com>
Date: Sun, 14 Dec 2025 14:49:49 -0700
Subject: [PATCH 08/10] [CK_TILE] Stream-K Tree Reduction and Cache Skipping
 Integration (#3371)

* CK Tile Stream-K Tree Reduction

This change adds the first implementation of the Stream-K tree reduction
strategy into CK Tile. The tree reduction reduces the number of
steps for accumulating results for a tile from O(N) to O(logN) where N
is the number of workgroups contributing to a C tile.

Additionally, in the original non-atomic reduction strategy, atomics
were used to set the flags buffer and to read from the flags buffer.
However, through investigation with the tree reduction, atomics with
default (relaxed) semantics were not enough to guarantee workgroups
would not read stale data, leading to incorrect results. Stronger
acquire/release memory orderings are too expensive. So, this change
also eliminates the use of atomics for setting the flags. Instead, we
leverage cache modifiers (e.g., GLC) to avoid writing to cache, thereby
avoiding the use of atomics.

Preliminary tests were also added for the normal reduction and tree
reduction. More will be added in a future PR via tile engine.

* Move Stream-K kernel files to a subdirectory

* Cleanup Code Style & Handle Unsupported Reductions

This change makes the following small changes:
- Add an explicit else block for unimplemented reduction strategies
- Clarify type of sk_flags_ptr via auto*
- Add description for extra_iters_before_me variable

* Run new copyright script on new files
---
 include/ck_tile/ops/common/streamk_common.hpp |   5 +-
 include/ck_tile/ops/gemm.hpp                  |   7 +-
 .../streamk_gemm/streamk_gemm_coherency.hpp   |  35 ++++
 .../streamk_gemm_kernel.hpp                   | 195 +++++++++++++-----
 .../streamk_gemm_tile_partitioner.hpp         |  22 +-
 .../streamk_gemm_tile_partitioner_impl.hpp    |  36 +++-
 test/ck_tile/gemm_streamk/CMakeLists.txt      |   3 +
 .../test_gemm_streamk_fp16_reduction.cpp      |  17 ++
 .../test_gemm_streamk_reduction_cases.inc     |  88 ++++++++
 .../gemm_streamk/test_gemm_streamk_types.hpp  |   8 +
 .../gemm_streamk/test_gemm_streamk_util.hpp   |  31 ++-
 .../test_streamk_tile_partitioner.cpp         |  79 +++++++
 .../test_streamk_tile_partitioner_common.hpp  |  68 +++++-
 13 files changed, 524 insertions(+), 70 deletions(-)
 create mode 100644 include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_coherency.hpp
 rename include/ck_tile/ops/gemm/kernel/{ => streamk_gemm}/streamk_gemm_kernel.hpp (79%)
 rename include/ck_tile/ops/gemm/kernel/{ => streamk_gemm}/streamk_gemm_tile_partitioner.hpp (92%)
 rename include/ck_tile/ops/gemm/kernel/{ => streamk_gemm}/streamk_gemm_tile_partitioner_impl.hpp (88%)
 create mode 100644 test/ck_tile/gemm_streamk/smoke_tests/test_gemm_streamk_fp16_reduction.cpp
 create mode 100644 test/ck_tile/gemm_streamk/test_gemm_streamk_reduction_cases.inc

diff --git a/include/ck_tile/ops/common/streamk_common.hpp b/include/ck_tile/ops/common/streamk_common.hpp
index c97282a8be..c723251112 100644
--- a/include/ck_tile/ops/common/streamk_common.hpp
+++ b/include/ck_tile/ops/common/streamk_common.hpp
@@ -8,7 +8,8 @@
 namespace ck_tile {
 enum StreamKReductionStrategy : uint32_t
 {
-    Atomic    = 0u,
-    Reduction = 1u
+    Atomic        = 0u,
+    Reduction     = 1u,
+    TreeReduction = 2u
 };
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index d518a15b7e..0eaedbfb3a 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -33,9 +33,10 @@
 #include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
 #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
-#include "ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp"
-#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp"
-#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp"
+#include "ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_coherency.hpp"
+#include "ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_kernel.hpp"
+#include "ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner.hpp"
+#include "ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner_impl.hpp"
 #include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp"
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_coherency.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_coherency.hpp
new file mode 100644
index 0000000000..65e29c7fd5
--- /dev/null
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_coherency.hpp
@@ -0,0 +1,35 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+#include "ck_tile/core/arch/arch.hpp"
+namespace ck_tile {
+
+template <typename CompilerTarget, typename Enabler = void>
+struct StreamKCoherency
+{
+    static constexpr amd_buffer_coherence_enum BUFFER_COHERENCE =
+        amd_buffer_coherence_enum::coherence_default;
+};
+
+template <typename CompilerTarget>
+struct StreamKCoherency<CompilerTarget,
+                        core::arch::enable_if_target_id_t<CompilerTarget,
+                                                          core::arch::amdgcn_target_id::GFX942,
+                                                          core::arch::amdgcn_target_id::GFX950>>
+{
+    static constexpr amd_buffer_coherence_enum BUFFER_COHERENCE =
+        amd_buffer_coherence_enum::SYSTEM_NT0;
+};
+
+template <typename CompilerTarget>
+struct StreamKCoherency<CompilerTarget,
+                        core::arch::enable_if_target_id_t<CompilerTarget,
+                                                          core::arch::amdgcn_target_id::GFX908,
+                                                          core::arch::amdgcn_target_id::GFX90A>>
+{
+    static constexpr amd_buffer_coherence_enum BUFFER_COHERENCE =
+        amd_buffer_coherence_enum::glc_slc;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_kernel.hpp
similarity index 79%
rename from include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
rename to include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_kernel.hpp
index 6130107cfe..d1fd32dc1b 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_kernel.hpp
@@ -6,6 +6,7 @@
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/common.hpp"
 #include "ck_tile/host/concat.hpp"
+#include "streamk_gemm_coherency.hpp"
 
 namespace ck_tile {
 
@@ -318,37 +319,58 @@ struct StreamKKernel
      * results.
      * @param kargs Kernel arguments, including the workspace pointer.
      * @param cta_idx The index of the current thread block (CTA).
-     * @note This function utilizes a workgroup barrier to set a synchronization flag for the given
-     * CTA index.
+     * @note This function utilizes a scalar store to write to the flags buffer.
      */
     CK_TILE_DEVICE void SignalStorePartialDone(const StreamKKernelArgs& kargs,
                                                index_t cta_idx) const
     {
-        auto sk_flags_ptr = static_cast<uint32_t*>(kargs.workspace_ptr);
-        workgroup_barrier sk_flags(sk_flags_ptr);
-        sk_flags.wait_set(0, 1, cta_idx);
+        auto* sk_flags_ptr = static_cast<index_t*>(kargs.workspace_ptr);
+        index_t offset     = cta_idx * sizeof(index_t);
+
+        asm volatile("s_mov_b32 m0, %2\n\t"
+                     // Depending on the architecture, the GLC flag will bypass the appropriate
+                     // cache level(s) to ensure the write is visible to other workgroups. See the
+                     // appropriate ISA for details about the GLC modifier.
+                     "s_store_dword %0, %1, %2 glc\n\t"
+                     "s_waitcnt lgkmcnt(0)" // Wait for the store to complete
+                     :
+                     : "s"(1), "s"(sk_flags_ptr), "s"(offset)
+                     : "memory");
     }
 
     /**
      * @brief Waits for the thread block (cta_idx) to complete storing its partial results.
      * @param kargs Kernel arguments, including the workspace pointer.
      * @param cta_idx The index of the thread block (CTA).
-     * @note This function utilizes a workgroup barrier to wait for the synchronization flag to be
-     * set by the given CTA index.
+     * @note This function utilizes a scalar load to read from the flags
+     * buffer.
      */
     CK_TILE_DEVICE void WaitStorePartialDone(const StreamKKernelArgs& kargs, index_t cta_idx) const
     {
-        auto sk_flags_ptr = static_cast<uint32_t*>(kargs.workspace_ptr);
-        workgroup_barrier sk_flags(sk_flags_ptr);
-        sk_flags.wait_eq(1, cta_idx);
+        auto* sk_flags_ptr = static_cast<index_t*>(kargs.workspace_ptr);
+        index_t result;
+        index_t offset = cta_idx * sizeof(index_t);
+
+        do
+        {
+            asm volatile("s_mov_b32 m0, %2\n\t"
+                         // Depending on the architecture, the GLC flag will bypass the
+                         // appropriate cache level(s) to avoid reading stale flags. See the
+                         // appropriate ISA for details about the GLC modifier.
+                         "s_load_dword %0, %1, %2 glc\n\t"
+                         "s_waitcnt lgkmcnt(0)" // Wait for the load to complete
+                         : "=s"(result)
+                         : "s"(sk_flags_ptr), "s"(offset)
+                         : "memory");
+        } while(result != 1);
     }
 
     /**
      * @brief Adds the values of a block tile to an output block tile.
      * @param in_out_block_tile The output block tile to which values are added.
      * @param in_block_tile The input block tile whose values are added.
-     * @note This function iterates over the distributed spans of the block tiles and updates the
-     * output block tile with accumulated values.
+     * @note This function iterates over the distributed spans of the block tiles and updates
+     * the output block tile with accumulated values.
      */
     template <typename OAccTile>
     CK_TILE_DEVICE void AddBlockTile(OAccTile& in_out_block_tile,
@@ -370,8 +392,8 @@ struct StreamKKernel
      * @param cta_idx The index of the thread block (CTA).
      * @param c_block_tile_dist The tile distribution for the block.
      * @return The loaded partial block tile.
-     * @note This function calculates the buffer pointer and uses the tile distribution for loading
-     * the partial block tile.
+     * @note This function calculates the buffer pointer and uses the tile distribution for
+     * loading the partial block tile.
      */
     template <typename DataType, typename OAccTileDist>
     CK_TILE_DEVICE auto LoadPartial(const StreamKKernelArgs& kargs,
@@ -405,8 +427,8 @@ struct StreamKKernel
      * @param kargs Kernel arguments, including the workspace pointer.
      * @param cta_idx The index of the thread block (CTA).
      * @param c_block_tile The block tile to be stored.
-     * @note This function calculates the buffer pointer and uses the tile window for storing the
-     * partial block tile.
+     * @note This function calculates the buffer pointer and uses the tile window for storing
+     * the partial block tile.
      */
     template <typename OAccTile>
     CK_TILE_DEVICE void StorePartial(const StreamKKernelArgs& kargs,
@@ -420,7 +442,10 @@ struct StreamKKernel
                                    kargs.tile_partitioner.get_flags_buffer_size() +
                                    cta_idx * c_block_tile_buffer_size;
 
-        const auto& partial_tensor_view = make_naive_tensor_view<address_space_enum::global>(
+        const auto& partial_tensor_view = make_naive_tensor_view<
+            address_space_enum::global,
+            memory_operation_enum::set,
+            StreamKCoherency<decltype(core::arch::get_compiler_target())>::BUFFER_COHERENCE>(
             static_cast<typename OAccTile::DataType*>(partial_buffer_ptr),
             make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
             make_tuple(TilePartitioner::NPerBlock, 1),
@@ -431,8 +456,11 @@ struct StreamKKernel
             partial_tensor_view,
             make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
             {0, 0});
-
         store_tile(partial_tile_window, c_block_tile);
+        // Wait for all vector stores for this wavefront to complete
+        s_waitcnt</*vmcnt*/ 0, waitcnt_arg::kMaxExpCnt, waitcnt_arg::kMaxLgkmCnt>();
+        // Wait for all wavefronts in this workgroup to arrive here before continuing
+        __builtin_amdgcn_s_barrier();
     }
 
     /**
@@ -483,7 +511,8 @@ struct StreamKKernel
             {
                 BaseGemm(kargs, tile_idx, num_loop_sk, i_k_a, i_k_b, k_size, smem_ptr_0);
             }
-            else
+            else if(TilePartitioner::ReductionStrategy == StreamKReductionStrategy::Reduction ||
+                    TilePartitioner::ReductionStrategy == StreamKReductionStrategy::TreeReduction)
             {
                 const auto c_macro_tile_idx =
                     kargs.tile_partitioner.get_output_tile_index(tile_idx);
@@ -528,46 +557,107 @@ struct StreamKKernel
 
                 auto tile_started = iter_start == tile_iter_start;
                 auto tile_ended   = iter_end >= tile_iter_end;
-                if(!tile_started)
+
+                if constexpr(TilePartitioner::ReductionStrategy ==
+                             StreamKReductionStrategy::Reduction)
                 {
-                    StorePartial(kargs, cta_idx, c_block_tile);
-                    // Ensure device-wide visibility of partial results stored in global memory
-                    // before signaling completion. __threadfence() guarantees that all global
-                    // memory writes by this thread are visible to other threads on the device.
-                    __threadfence(); // send signal when the store is done
-                    SignalStorePartialDone(kargs, cta_idx);
+                    if(!tile_started)
+                    {
+                        StorePartial(kargs, cta_idx, c_block_tile);
+                        SignalStorePartialDone(kargs, cta_idx);
+                    }
+                    else
+                    {
+                        auto accum_block_tile = c_block_tile;
+                        if(!tile_ended)
+                        {
+                            const index_t iter_per_tile =
+                                kargs.tile_partitioner.get_iters_per_tile();
+                            const index_t iter_per_cta =
+                                kargs.tile_partitioner.get_iters_per_sk_cta();
+                            const index_t extra_iters = kargs.tile_partitioner.get_extra_iters();
+                            int accum_iters           = local_iter_end - local_iter_start;
+                            int next_cta              = cta_idx + 1;
+
+                            while(accum_iters < iter_per_tile)
+                            {
+                                WaitStorePartialDone(kargs, next_cta);
+
+                                using BlockType = remove_cvref_t<decltype(c_block_tile)>;
+                                AddBlockTile(
+                                    accum_block_tile,
+                                    LoadPartial<typename BlockType::DataType>(
+                                        kargs, next_cta, c_block_tile.get_tile_distribution()));
+
+                                accum_iters += iter_per_cta + (next_cta < extra_iters);
+                                ++next_cta;
+                            }
+                        }
+
+                        auto& c_block_window = gemm_tile_windows.at(UniversalGemmKernel::I3);
+                        EpiloguePipeline{}(
+                            c_block_window, accum_block_tile, ds_block_window, smem_ptr_0);
+                    }
                 }
-                else
+                else // Tree Reduction
                 {
                     auto accum_block_tile = c_block_tile;
-                    if(!tile_ended)
+                    index_t tile_local_cta_idx =
+                        kargs.tile_partitioner.get_tile_local_cta_index(tile_iter_start, cta_idx);
+
+                    for(index_t stride = 1;; stride <<= 1)
                     {
-                        const index_t iter_per_tile = kargs.tile_partitioner.get_iters_per_tile();
-                        const index_t iter_per_cta  = kargs.tile_partitioner.get_iters_per_sk_cta();
-                        const index_t extra_iters   = kargs.tile_partitioner.get_extra_iters();
-                        int accum_iters             = local_iter_end - local_iter_start;
-                        int next_cta                = cta_idx + 1;
+                        const index_t partner_cta_idx = cta_idx + stride;
+                        const index_t partner_start_iter =
+                            kargs.tile_partitioner.get_start_iter(partner_cta_idx);
+                        bool partner_in_tile = partner_start_iter < tile_iter_end;
 
-                        while(accum_iters < iter_per_tile)
+                        // If the partner of the workgroup who started the tile is not in this tile,
+                        // then the work for this tile is done and results can be stored in the C
+                        // tensor.
+                        if(tile_started && !partner_in_tile)
                         {
-                            WaitStorePartialDone(kargs, next_cta);
+                            auto& c_block_window = gemm_tile_windows.at(UniversalGemmKernel::I3);
+                            EpiloguePipeline{}(
+                                c_block_window, accum_block_tile, ds_block_window, smem_ptr_0);
+                            break;
+                        }
 
-                            using BlockType = remove_cvref_t<decltype(c_block_tile)>;
-                            AddBlockTile(
-                                accum_block_tile,
-                                LoadPartial<typename BlockType::DataType>(
-                                    kargs, next_cta, c_block_tile.get_tile_distribution()));
-
-                            accum_iters += iter_per_cta + (next_cta < extra_iters);
-                            ++next_cta;
+                        // It's this workgroup's turn to read from partials.
+                        if(tile_local_cta_idx % (stride << 1) == 0)
+                        {
+                            // If this workgroup's partner is in the tile then it can read from
+                            // partials and accumulate results.
+                            if(partner_in_tile)
+                            {
+                                WaitStorePartialDone(kargs, partner_cta_idx);
+                                using BlockType = remove_cvref_t<decltype(c_block_tile)>;
+                                AddBlockTile(accum_block_tile,
+                                             LoadPartial<typename BlockType::DataType>(
+                                                 kargs,
+                                                 partner_cta_idx,
+                                                 c_block_tile.get_tile_distribution()));
+                            }
+                        }
+                        // Otherwise, it's this workgroup's turn to write to partials. All
+                        // workgroups, except the workgroup who starts the tile, will write to
+                        // partials.
+                        else
+                        {
+                            StorePartial(kargs, cta_idx, accum_block_tile);
+                            SignalStorePartialDone(kargs, cta_idx);
+                            // Once the workgroup writes to partials, it has no more work to do for
+                            // this tile.
+                            break;
                         }
                     }
-
-                    auto& c_block_window = gemm_tile_windows.at(UniversalGemmKernel::I3);
-                    EpiloguePipeline{}(
-                        c_block_window, accum_block_tile, ds_block_window, smem_ptr_0);
                 }
             }
+            else // No implementation for any other strategy: reject it at compile time.
+            {
+                static_assert(TilePartitioner::ReductionStrategy <= TreeReduction,
+                    "An implementation does not exist for the chosen reduction strategy.");
+            }
 
             // Prepare for next Stream-K loop iteration.
             iter_start = tile_iter_end;
@@ -640,10 +730,10 @@ struct StreamKKernel
 
     private:
     /**
-     * @brief Computes the K offsets in the A and B tensors given iter_offset, where iter_offset is
-     * the starting macro tile index in the K dimension for the workgroup.
-     * @return A tuple containing the offsets into the A and B tensors accounting for the layouts
-     * of A and B.
+     * @brief Computes the K offsets in the A and B tensors given iter_offset, where iter_offset
+     * is the starting macro tile index in the K dimension for the workgroup.
+     * @return A tuple containing the offsets into the A and B tensors accounting for the
+     * layouts of A and B.
      * @note The default case is that A is assumed to be row major and B is assumed to be column
      * major.
      */
@@ -688,7 +778,8 @@ struct StreamKKernel
     }
 
     /**
-     * @brief Computes the occupancy (i.e. maximum number of active blocks per CU) for the kernel
+     * @brief Computes the occupancy (i.e. maximum number of active blocks per CU) for the
+     * kernel
      * @return The occupancy
      * @note This function queries the maximum occupancy of the kernel using
      * `hipOccupancyMaxActiveBlocksPerMultiprocessor`.
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner.hpp
similarity index 92%
rename from include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp
rename to include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner.hpp
index 9ab75fbdbf..a6022e8b8e 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner.hpp
@@ -46,6 +46,16 @@ struct StreamKTilePartitionerBase
     CK_TILE_HOST_DEVICE index_t get_flags_buffer_size() const noexcept;
 
     public:
+    /**
+     * @brief Calculates the start iteration for the given cta_idx.
+     * @param cta_idx     The current Stream-K workgroup's index.
+     * @return index_t    The start iteration.
+     * @note It is assumed that the first Stream-K workgroup has a `cta_idx` of zero. If a
+     * non-persistent DP section is used, then a Stream-K workgroup's `cta_idx` should be something
+     * like `blockIdx.x` minus number of DP workgroups.
+     */
+    CK_TILE_DEVICE index_t get_start_iter(index_t cta_idx) const noexcept;
+
     /**
      * @brief Calculates the start and end iteration given the cta_idx.
      *
@@ -107,7 +117,17 @@ struct StreamKTilePartitionerBase
     get_local_iter_end(index_t tile_iter_start, index_t iter_end, index_t tile_iter_end) noexcept;
 
     /**
-     * @brief Calculates the workgroups 2D tile index in the C tensor given the 1D tile index.
+     * @brief Calculates the workgroup's local CTA idx within the given tile.
+     *
+     * @param tile_iter_start  The starting tile iteration.
+     * @param cta_idx          The Stream-K workgroup index.
+     * @return index_t         The tile local workgroup index in the tile.
+     */
+    CK_TILE_DEVICE index_t get_tile_local_cta_index(index_t tile_iter_start,
+                                                    index_t cta_idx) const noexcept;
+
+    /**
+     * @brief Calculates the workgroup's 2D tile index in the C tensor given the 1D tile index.
      *
      * @param tile_idx  The 1D tile index in the C tensor for the workgroup.
      * @return index_t  The corresponding 2D tile index in the C tensor for the workgroup.
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner_impl.hpp
similarity index 88%
rename from include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp
rename to include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner_impl.hpp
index acc1860f1f..1764a1ce83 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner_impl.hpp
@@ -61,13 +61,24 @@ StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_flags
     return sizeof(index_t) * sk_ctas_;
 }
 
+template <typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategyType>
+CK_TILE_DEVICE index_t
+StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_start_iter(
+    index_t cta_idx) const noexcept
+{
+    // Compute the number of extra iterations done before this CTA. If the cta_idx is less than
+    // extra_iters, the number of extra iterations before the CTA is exactly the cta_idx. Otherwise,
+    // it is extra_iters.
+    index_t extra_iters_before_me = ck_tile::min(cta_idx, extra_iters_);
+    return total_dp_iters_ + cta_idx * iters_per_sk_cta_ + extra_iters_before_me;
+}
+
 template <typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategyType>
 CK_TILE_DEVICE void
 StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_iter_boundaries(
     index_t& iter, index_t& iter_end, index_t cta_idx) const noexcept
 {
-    index_t extra_iters_before_me = ck_tile::min(cta_idx, extra_iters_);
-    iter     = total_dp_iters_ + cta_idx * iters_per_sk_cta_ + extra_iters_before_me;
+    iter     = get_start_iter(cta_idx);
     iter_end = iter + iters_per_sk_cta_ + (cta_idx < extra_iters_);
 }
 
@@ -104,6 +115,24 @@ StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_local
     return ck_tile::min(iter_end, tile_iter_end) - tile_iter;
 }
 
+template <typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategyType>
+CK_TILE_DEVICE index_t
+StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_tile_local_cta_index(
+    index_t tile_iter_start, index_t cta_idx) const noexcept
+{
+    tile_iter_start = tile_iter_start - (dp_tiles_ * iters_per_tile_);
+
+    // Compute how many WGs fit before this tile starts assuming each WG does an
+    // extra_iter
+    const index_t num_extra_iter_ctas = tile_iter_start / (iters_per_sk_cta_ + 1);
+    // Compute how many WGs fit before this tile starts excluding extra iters
+    const index_t num_non_extra_iter_ctas = (tile_iter_start - extra_iters_) / iters_per_sk_cta_;
+    // Compute the CTA idx for the CTA that starts this tile
+    const index_t coop_group_start =
+        num_extra_iter_ctas < extra_iters_ ? num_extra_iter_ctas : num_non_extra_iter_ctas;
+    return cta_idx - coop_group_start;
+}
+
 template <typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategyType>
 CK_TILE_DEVICE auto
 StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_output_tile_index(
@@ -121,7 +150,8 @@ CK_TILE_HOST_DEVICE index_t
 StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_workspace_size(
     index_t acc_element_bytes) const noexcept
 {
-    if constexpr(ReductionStrategy == StreamKReductionStrategy::Reduction)
+    if constexpr(ReductionStrategy == StreamKReductionStrategy::Reduction ||
+                 ReductionStrategy == StreamKReductionStrategy::TreeReduction)
     {
 
         return get_partials_buffer_size(acc_element_bytes) + get_flags_buffer_size();
diff --git a/test/ck_tile/gemm_streamk/CMakeLists.txt b/test/ck_tile/gemm_streamk/CMakeLists.txt
index d8b4ff945f..1390e5ee07 100644
--- a/test/ck_tile/gemm_streamk/CMakeLists.txt
+++ b/test/ck_tile/gemm_streamk/CMakeLists.txt
@@ -23,6 +23,9 @@ if(GPU_TARGETS MATCHES "gfx90a|gfx942|gfx950")
     #TODO: support all arches
     #TODO: current c-shuffle only supports C layout as R
     add_gtest_executable(test_ck_tile_streamk_tile_partitioner test_streamk_tile_partitioner.cpp)
+    add_gtest_executable(test_ck_tile_streamk_reduction
+                        ${CMAKE_CURRENT_SOURCE_DIR}/smoke_tests/test_gemm_streamk_fp16_reduction.cpp
+                        test_gemm_streamk_util.cpp)
     add_gtest_executable(test_ck_tile_streamk_smoke 
                         ${CMAKE_CURRENT_SOURCE_DIR}/smoke_tests/test_gemm_streamk_fp16_persistent.cpp
                         ${CMAKE_CURRENT_SOURCE_DIR}/smoke_tests/test_gemm_streamk_bf16_persistent.cpp
diff --git a/test/ck_tile/gemm_streamk/smoke_tests/test_gemm_streamk_fp16_reduction.cpp b/test/ck_tile/gemm_streamk/smoke_tests/test_gemm_streamk_fp16_reduction.cpp
new file mode 100644
index 0000000000..bcd4583da2
--- /dev/null
+++ b/test/ck_tile/gemm_streamk/smoke_tests/test_gemm_streamk_fp16_reduction.cpp
@@ -0,0 +1,17 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "test_gemm_streamk_common_includes.hpp"
+
+template <typename Tuple>
+class TestCkTileStreamKFp16Reduction : public TestCkTileStreamK<Tuple>
+{
+};
+
+#define TEST_SUITE_NAME TestCkTileStreamKFp16Reduction
+
+TYPED_TEST_SUITE(TestCkTileStreamKFp16Reduction, KernelTypesStreamKFp16Reduction);
+
+#include "test_gemm_streamk_reduction_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_reduction_cases.inc b/test/ck_tile/gemm_streamk/test_gemm_streamk_reduction_cases.inc
new file mode 100644
index 0000000000..66c3e3b5e9
--- /dev/null
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_reduction_cases.inc
@@ -0,0 +1,88 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_SKOnly_OneTile_Tree)
+{
+    constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value;
+    constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value;
+    constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value;
+    const ck_tile::index_t cu_count   = get_cu_count();
+    // One macro tile in M and in N; K is K_Tile times the value from get_cu_count().
+    ck_tile::index_t M = M_Tile;
+    ck_tile::index_t N = N_Tile;
+    ck_tile::index_t K = K_Tile * cu_count;
+
+    this->Run(M, N, K, ck_tile::StreamKReductionStrategy::TreeReduction);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_SKOnly_OneTile)
+{
+    constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value;
+    constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value;
+    constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value;
+    const ck_tile::index_t cu_count   = get_cu_count();
+    // One macro tile in M and in N; K is K_Tile times the value from get_cu_count().
+    ck_tile::index_t M = M_Tile;
+    ck_tile::index_t N = N_Tile;
+    ck_tile::index_t K = K_Tile * cu_count;
+
+    this->Run(M, N, K, ck_tile::StreamKReductionStrategy::Reduction);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_SKOnly_4Tiles_Tree)
+{
+    constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value;
+    constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value;
+    constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value;
+    const ck_tile::index_t cu_count   = get_cu_count();
+    // 4 x 1 macro tiles; K adds 25 K_Tile steps on top of K_Tile * get_cu_count().
+    ck_tile::index_t M = M_Tile * 4;
+    ck_tile::index_t N = N_Tile;
+    ck_tile::index_t K = K_Tile * cu_count + (25 * K_Tile);
+
+    this->Run(M, N, K, ck_tile::StreamKReductionStrategy::TreeReduction);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_SKOnly_4Tiles_Reduction)
+{
+    constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value;
+    constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value;
+    constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value;
+    const ck_tile::index_t cu_count   = get_cu_count();
+    // 4 x 1 macro tiles; K adds 25 K_Tile steps on top of K_Tile * get_cu_count().
+    ck_tile::index_t M = M_Tile * 4;
+    ck_tile::index_t N = N_Tile;
+    ck_tile::index_t K = K_Tile * cu_count + (25 * K_Tile);
+
+    this->Run(M, N, K, ck_tile::StreamKReductionStrategy::Reduction);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_SKOnly_21Tiles_Tree)
+{
+    constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value;
+    constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value;
+    constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value;
+    const ck_tile::index_t cu_count   = get_cu_count();
+    // 3 x 7 macro tiles; K adds 30 K_Tile steps on top of K_Tile * get_cu_count().
+    ck_tile::index_t M = M_Tile * 3;
+    ck_tile::index_t N = N_Tile * 7;
+    ck_tile::index_t K = K_Tile * cu_count + (30 * K_Tile);
+
+    this->Run(M, N, K, ck_tile::StreamKReductionStrategy::TreeReduction);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StreamK_SKOnly_21Tiles)
+{
+    constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value;
+    constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value;
+    constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value;
+    const ck_tile::index_t cu_count   = get_cu_count();
+    // 3 x 7 macro tiles; K adds 30 K_Tile steps on top of K_Tile * get_cu_count().
+    ck_tile::index_t M = M_Tile * 3;
+    ck_tile::index_t N = N_Tile * 7;
+    ck_tile::index_t K = K_Tile * cu_count + (30 * K_Tile);
+
+    this->Run(M, N, K, ck_tile::StreamKReductionStrategy::Reduction);
+}
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp
index efb7416580..ece313b8aa 100644
--- a/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp
@@ -33,6 +33,14 @@ using KernelTypesStreamKFp16Persistent = ::testing::Types<
     std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,         I256,        I256,        I32,     Persistent>
 >;
 
+using KernelTypesStreamKFp16Reduction = ::testing::Types<
+//                ALayout  BLayout  CLayout   ADataType  BDataType  AccDataType  CDataType  M_MacroTile  N_MacroTile  K_MacroTile  Persistent
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,         I256,        I256,        I32,     Persistent>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,         I256,        I256,        I32,     Persistent>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,         I256,        I256,        I32,     Persistent>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,         I256,        I256,        I32,     Persistent>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,         I256,        I256,        I32,     NonPersistent>>;
+
 using KernelTypesStreamKBf16Persistent = ::testing::Types<
     std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       BF16,         I256,        I256,        I32,    Persistent>,
     std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       BF16,         I256,        I256,        I32,    Persistent>,
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp
index 213702551a..540109a999 100644
--- a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp
@@ -144,7 +144,11 @@ class TestCkTileStreamK : public ::testing::Test
 
             using Kernel = ck_tile::StreamKKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
 
-            auto kargs = Kernel::MakeKernelArgs(args);
+            auto kargs                = Kernel::MakeKernelArgs(args);
+            const auto workspace_size = Kernel::GetWorkSpaceSize(kargs);
+            ck_tile::DeviceMem workspace_data(workspace_size);
+            workspace_data.SetZero();
+            kargs.workspace_ptr = workspace_data.GetDeviceBuffer();
 
             if(!Kernel::IsSupportedArgument(kargs))
             {
@@ -184,11 +188,6 @@ class TestCkTileStreamK : public ::testing::Test
 
         using namespace ck_tile::literals;
 
-        if(reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction)
-        {
-            throw std::runtime_error("Reduction Strategy is current unsupported!\n");
-        }
-
         auto f_host_tensor_descriptor = [](std::size_t row,
                                            std::size_t col,
                                            std::size_t stride,
@@ -252,9 +251,25 @@ class TestCkTileStreamK : public ::testing::Test
                                       stride_B,
                                       stride_C};
 
-        ck_tile::index_t num_accumulations_per_tile =
-            invoke_streamk<ck_tile::StreamKReductionStrategy::Atomic>(
+        ck_tile::index_t num_accumulations_per_tile;
+
+        if(reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic)
+        {
+            num_accumulations_per_tile = invoke_streamk<ck_tile::StreamKReductionStrategy::Atomic>(
                 args, ck_tile::stream_config{nullptr, false, 0, 0, 1});
+        }
+        else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction)
+        {
+            num_accumulations_per_tile =
+                invoke_streamk<ck_tile::StreamKReductionStrategy::Reduction>(
+                    args, ck_tile::stream_config{nullptr, false, 0, 0, 1});
+        }
+        else
+        {
+            num_accumulations_per_tile =
+                invoke_streamk<ck_tile::StreamKReductionStrategy::TreeReduction>(
+                    args, ck_tile::stream_config{nullptr, false, 0, 0, 1});
+        }
 
         c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
 
diff --git a/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner.cpp b/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner.cpp
index dd74efc27a..637f71c04f 100644
--- a/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner.cpp
+++ b/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner.cpp
@@ -372,6 +372,85 @@ TEST(StreamKTilePartitionerBaseGetOutputTileIndex, TestAllMappings)
     }
 }
 
+TEST(StreamKTilePartitionerBaseGetTileLocalCtaIndex, SKOnlyLargeK)
+{
+    /*
+    StreamKTilePartitionerBaseConfigSKOnlyLargeK describes this partitioning:
+    - tiles in the C tensor: 2
+    - iters_per_tile: 5
+    - grid: 5
+    - dp_tiles: 0
+    - sk_tiles: 2
+    - iters_per_sk_cta: 2
+    - extra_iters: 0
+
+    Iteration layout across the tiles:
+
+    tile_idx: __________0_________|_________1_________|
+    tile_iter:| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
+              |   |   |   |   |   |   |   |   |   |   |
+              |<--------------SK Tiles--------------->|
+
+    Every SK CTA covers 2 iterations, so SK CTA 2 is shared by both tiles:
+    - SK CTA 0: tile_iter_start is 0 with local CTA index of 0 in tile 0
+    - SK CTA 1: tile_iter_start is 0 with local CTA index of 1 in tile 0
+    - SK CTA 2: tile_iter_start is 0 with local CTA index of 2 in tile 0
+    - SK CTA 2: tile_iter_start is 5 with local CTA index of 0 in tile 1
+    - SK CTA 3: tile_iter_start is 5 with local CTA index of 1 in tile 1
+    - SK CTA 4: tile_iter_start is 5 with local CTA index of 2 in tile 1
+    */
+
+    using Config = StreamKTilePartitionerBaseConfigSKOnlyLargeK;
+
+    // Each entry is (tile_iter_start, cta_idx, expected tile-local CTA index).
+    const std::vector<std::array<ck_tile::index_t, 3>> cases{
+        {0, 0, 0}, {0, 1, 1}, {0, 2, 2}, {5, 2, 0}, {5, 3, 1}, {5, 4, 2}};
+
+    for(const auto& [tile_iter_start, cta_idx, expected_local_idx] : cases)
+    {
+        test_get_tile_local_cta_idx<Config>(tile_iter_start, cta_idx, expected_local_idx);
+    }
+}
+
+TEST(StreamKTilePartitionerBaseGetTileLocalCtaIndex, DP2TileSK)
+{
+    /*
+    StreamKTilePartitionerBaseConfigDP2TileSK describes this partitioning:
+    - tiles in the C tensor: 7
+    - iters_per_tile: 2
+    - grid: 3
+    - dp_tiles: 3
+    - sk_tiles: 4
+    - iters_per_sk_cta: 2
+    - extra_iters: 2
+
+    Iteration layout across the tiles:
+
+    tile_idx: ____0___|___1___|___2___|___3___|___4___|____5____|____6____|
+    tile_iter:| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
+              |   |   |   |   |   |   |   |   |   |   |    |    |    |    |
+              |<-------DP Tiles------>|<------------SK Tiles------------->|
+
+    SK CTAs 0 and 1 take 3 iterations each (one extra) and SK CTA 2 takes 2, which gives:
+    - SK CTA 0: tile_iter_start is 6 with local CTA index of 0 in tile 3
+    - SK CTA 0: tile_iter_start is 8 with local CTA index of 0 in tile 4
+    - SK CTA 1: tile_iter_start is 8 with local CTA index of 1 in tile 4
+    - SK CTA 1: tile_iter_start is 10 with local CTA index of 0 in tile 5
+    - SK CTA 2: tile_iter_start is 12 with local CTA index of 0 in tile 6
+    */
+
+    using Config = StreamKTilePartitionerBaseConfigDP2TileSK;
+
+    // Each entry is (tile_iter_start, cta_idx, expected tile-local CTA index).
+    const std::vector<std::array<ck_tile::index_t, 3>> cases{
+        {6, 0, 0}, {8, 0, 0}, {8, 1, 1}, {10, 1, 0}, {12, 2, 0}};
+
+    for(const auto& [tile_iter_start, cta_idx, expected_local_idx] : cases)
+    {
+        test_get_tile_local_cta_idx<Config>(tile_iter_start, cta_idx, expected_local_idx);
+    }
+}
+
 // Persistent
 TEST(StreamKTilePartitioner_PersistentConstructor, SKOnly)
 {
diff --git a/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner_common.hpp b/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner_common.hpp
index 0bb0940651..3daec049a7 100644
--- a/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner_common.hpp
+++ b/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner_common.hpp
@@ -4,6 +4,7 @@
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "gtest/gtest.h"
+#include <array>
 
 enum StreamKTilePartitionerBaseMethodId
 {
@@ -12,7 +13,8 @@ enum StreamKTilePartitionerBaseMethodId
     GET_TILE_BOUNDARIES,
     GET_TILE_INDEX,
     GET_ITER_BOUNDARIES,
-    GET_OUTPUT_TILE_INDEX
+    GET_OUTPUT_TILE_INDEX,
+    GET_TILE_LOCAL_CTA_INDEX
 };
 
 // Base kernel wrapper class to facilitate testing class device functions.
@@ -136,6 +138,22 @@ struct KernelWrapperSpecialized<TilePartitioner,
     }
 };
 
+template <typename TilePartitioner>
+struct KernelWrapperSpecialized<TilePartitioner,
+                                StreamKTilePartitionerBaseMethodId::GET_TILE_LOCAL_CTA_INDEX>
+    : public KernelWrapper<TilePartitioner>
+{
+    // Wrapper for get_tile_local_cta_index(arg1, arg2); the value is written through result1.
+    using Base = KernelWrapper<TilePartitioner>;
+
+    CK_TILE_DEVICE void operator()(typename Base::KernelArgs kargs)
+    {
+        // result1 has to be cast to index_t* before the computed index can be stored in it.
+        auto* const out = static_cast<ck_tile::index_t*>(kargs.result1);
+        *out = kargs.tile_partitioner.get_tile_local_cta_index(kargs.arg1, kargs.arg2);
+    }
+};
+
 struct StreamKTilePartitionerBaseExpected
 {
     ck_tile::index_t sk_tiles_;
@@ -243,6 +261,22 @@ struct StreamKTilePartitionerBaseConfigSKOnly : public StreamKTilePartitionerBas
                                              ck_tile::sequence<UNUSED, UNUSED, UNUSED>>;
 };
 
+struct StreamKTilePartitionerBaseConfigSKOnlyLargeK : public StreamKTilePartitionerBaseConfig
+{
+    static constexpr ck_tile::index_t M    = 8;
+    static constexpr ck_tile::index_t N    = 2;
+    static constexpr ck_tile::index_t K    = 10;
+    static constexpr ck_tile::index_t GRID = 5;
+    // Tile sizes: with M, N, K above this gives 2 output tiles of 5 K iterations each.
+    static constexpr ck_tile::index_t M_TILE = 4;
+    static constexpr ck_tile::index_t N_TILE = 2;
+    static constexpr ck_tile::index_t K_TILE = 2;
+    // Only the first sequence holds real sizes; the remaining two are UNUSED placeholders.
+    using GemmShape = ck_tile::TileGemmShape<ck_tile::sequence<M_TILE, N_TILE, K_TILE>,
+                                             ck_tile::sequence<UNUSED, UNUSED, UNUSED>,
+                                             ck_tile::sequence<UNUSED, UNUSED, UNUSED>>;
+};
+
 struct StreamKTilePartitionerBaseConfigEdgeCase : public StreamKTilePartitionerBaseConfig
 {
 
@@ -314,6 +348,38 @@ void test_get_output_tile_index(ck_tile::index_t tile_idx,
     EXPECT_EQ(in, in_expected);
 };
 
+template <typename Config>
+void test_get_tile_local_cta_idx(ck_tile::index_t tile_iter_start,
+                                 ck_tile::index_t cta_idx,
+                                 ck_tile::index_t expected_tile_local_cta_idx)
+{
+    // Evaluates get_tile_local_cta_index on the device for (tile_iter_start, cta_idx) and expects
+    // the value copied back to the host to equal expected_tile_local_cta_idx.
+    using TilePartitioner = ck_tile::StreamKTilePartitionerBase<typename Config::GemmShape>;
+    using Kernel =
+        KernelWrapperSpecialized<TilePartitioner,
+                                 StreamKTilePartitionerBaseMethodId::GET_TILE_LOCAL_CTA_INDEX>;
+
+    // Partitioner under test plus a one-element device buffer that receives the kernel output.
+    TilePartitioner tile_partitioner{Config::M, Config::N, Config::K, Config::GRID};
+    ck_tile::DeviceMem result_dev(sizeof(ck_tile::index_t));
+
+    // The third argument and the second result pointer are fillers (Config::UNUSED, nullptr).
+    auto kargs = Kernel::MakeKernelArgs(tile_iter_start,
+                                        cta_idx,
+                                        Config::UNUSED,
+                                        result_dev.GetDeviceBuffer(),
+                                        nullptr,
+                                        tile_partitioner);
+    ck_tile::launch_kernel(ck_tile::stream_config{nullptr, false, 0, 0, 1},
+                           ck_tile::make_kernel<1>(Kernel{}, 1, 1, 0, kargs));
+
+    // Copy the single index back to the host and compare it with the expectation.
+    ck_tile::index_t actual_local_idx;
+    result_dev.FromDevice(&actual_local_idx);
+    EXPECT_EQ(actual_local_idx, expected_tile_local_cta_idx);
+}
+
 // Configs for TilePartitioner Child structs
 struct StreamKTilePartitionerV2PersistentExpected
 {

From f5573f56d9d4981def16f575ddb14535b93bb9bb Mon Sep 17 00:00:00 2001
From: Linjun-AMD <Jun.Lin@amd.com>
Date: Mon, 15 Dec 2025 12:21:59 +0800
Subject: [PATCH 09/10] Add attention sink support for FMHA FWD (#3368)

* Revert "Revert "Add attn sink (#2892)" (#3250)"

This reverts commit 5adaa201eda9337553459bc4321b11695e380832.

* fix conflict

Signed-off-by: Linjun-AMD <Jun.Lin@amd.com>

* Add F_sink parameter to FmhaFwdPipeline

* Update tile_fmha_traits.hpp

* Refactor pipeline creation in fmha_fwd.py

Updated the pipeline creation logic to include 'sink' parameter in product combinations and adjusted the FmhaFwdPipeline calls accordingly.

* Update fmha_fwd.py

* Update fmha_fwd.py

* Update example/ck_tile/01_fmha/script/correct_test_fwd_sink.sh

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* update CHANGELOG.md

Signed-off-by: Linjun-AMD <Jun.Lin@amd.com>

* Update CHANGELOG with new features and support

* Update fmha_fwd.hpp

* Update CHANGELOG.md

* Update smoke_test_fwd_sink.sh

* Update correct_test_fwd_sink.sh

* Update smoke_test_fwd_sink.sh

---------

Signed-off-by: Linjun-AMD <Jun.Lin@amd.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 CHANGELOG.md                                  |   1 +
 example/ck_tile/01_fmha/CMakeLists.txt        |   2 +-
 .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py   |  75 +++++---
 .../01_fmha/codegen/ops/fmha_fwd_splitkv.py   |  42 +++--
 .../codegen/ops/fmha_pagedkv_prefill.py       |  33 +++-
 example/ck_tile/01_fmha/fmha_fwd.hpp          |  25 ++-
 example/ck_tile/01_fmha/fmha_fwd_runner.hpp   |   6 +-
 example/ck_tile/01_fmha/mask.hpp              |  42 ++++-
 .../01_fmha/script/correct_test_fwd_sink.sh   |  77 ++++++++
 .../ck_tile/01_fmha/script/run_full_test.sh   |   1 +
 .../01_fmha/script/smoke_test_fwd_sink.sh     |  86 +++++++++
 .../reference/reference_batched_masking.hpp   |   2 +-
 .../ck_tile/ops/fmha/block/block_masking.hpp  | 178 ++++++++++++++++--
 include/ck_tile/ops/fmha/block/variants.hpp   |  33 ++++
 .../fmha/kernel/fmha_batch_prefill_kernel.hpp |   7 +-
 .../ops/fmha/kernel/fmha_fwd_kernel.hpp       |  17 +-
 .../fmha/kernel/fmha_fwd_pagedkv_kernel.hpp   |  14 +-
 .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp   |  10 +-
 ...ock_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp |  94 ++++++---
 ...litkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp |  86 ++++++---
 ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp |  92 +++++++--
 .../pipeline/block_fmha_pipeline_problem.hpp  |   3 +
 .../pipeline/block_fmha_pipeline_qr_ks_vs.hpp |  99 ++++++++--
 .../block_fmha_pipeline_qr_ks_vs_async.hpp    | 102 +++++++---
 ...ck_fmha_pipeline_qr_ks_vs_async_trload.hpp |   1 +
 .../ops/fmha/pipeline/tile_fmha_traits.hpp    |  16 +-
 26 files changed, 948 insertions(+), 196 deletions(-)
 create mode 100644 example/ck_tile/01_fmha/script/correct_test_fwd_sink.sh
 create mode 100755 example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 15fdb09f49..997fb8bb8c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 ### Added
 * Added support for explicit GEMM in CK_TILE grouped convolution forward and backward weight.
 * Added TF32 convolution support on gfx942 and gfx950 in CK. It could be enabled/disabled via `DTYPES` of "tf32".
+* Added attention sink support for FMHA FWD, including the qr_ks_vs, qr_async, and splitkv pipelines.
 
 ### Changed
 
diff --git a/example/ck_tile/01_fmha/CMakeLists.txt b/example/ck_tile/01_fmha/CMakeLists.txt
index 6e7d69281d..9c81207361 100644
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -65,7 +65,7 @@ set(FMHA_BWD_CODE_GEN_COMMON_ARGS
 # there is no corresponding instance for parameters).
 if(BUILD_TESTING)
   # Filters are in the order of FMHA_FWD_KNOWN_APIS: fwd,fwd_splitkv_combine@fwd_splitkv,fwd_appendkv,pagedkv_prefill
-  list(APPEND FMHA_FWD_CODE_GEN_COMMON_ARGS --filter *_nlogits*_nskip*,*@*_nlogits*_nbias*,*,*_nlogits*_nskip*_pagedkv)
+  list(APPEND FMHA_FWD_CODE_GEN_COMMON_ARGS --filter *_nlogits*_nskip*_nsink*,*@*_nlogits*_nbias*_nsink*,*,*_nlogits*_nskip*_pagedkv*)
 endif()
 
 # generate a list of kernels, but not actually emit files at config sta
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index edc0e049c5..4d6900a802 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -76,7 +76,8 @@ using fmha_traits = ck_tile::TileFmhaTraits<{F_spad},
                                             {F_dropout},
                                             {F_qscale},
                                             {F_occupancy},
-                                            {F_skip}>;
+                                            {F_skip},
+                                            {F_sink}>;
 
 using fmha_variant = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>;
 
@@ -113,7 +114,7 @@ using fmha_kernel = {F_kernel}<fmha_pipeline, fmha_epilogue>;
 
 
 using trait = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
-                        {F_pipeline_enum}, {F_logits}, fmha_mask, {F_bias}, {F_lse}, {F_dropout}, {F_qscale}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}, {F_skip}>;
+                        {F_pipeline_enum}, {F_logits}, fmha_mask, {F_bias}, {F_lse}, {F_dropout}, {F_qscale}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}, {F_skip}, {F_sink}>;
 
 template<>
 float fmha_fwd_<trait, {F_arch.tag}>(const ck_tile::stream_config& s, fmha_fwd_args a)
@@ -229,9 +230,9 @@ FMHA_FWD_API_PER_HDIM_CASE = """{F_if}(t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hd
 }}
 """
 
-FMHA_FWD_API_INNER_DISPATCH = """{F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.has_dropout == {F_dropout}) && (t.qscale_type == {F_qscale_check}) && (t.skip_min_seqlen_q == {F_skip}) &&
+FMHA_FWD_API_INNER_DISPATCH = """{F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.has_dropout == {F_dropout}) && (t.qscale_type == {F_qscale_check}) && (t.skip_min_seqlen_q == {F_skip}) &&(t.has_sink == {F_sink}) &&
         ({F_scheck}) && ({F_seqtune}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{
-    using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_qscale}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}, {F_skip}>;
+    using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_qscale}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}, {F_skip}, {F_sink}>;
     return fmha_fwd_<trait_, {F_arch.tag}>(s, a);
 }}
 """
@@ -278,13 +279,14 @@ class FmhaFwdApiTrait:
     dvpad: str
     skip: str
     tr_load: str
+    sink: str
     constraint: CppConstraint
 
     @property
     def name(self) -> str:
         return (
             f"{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-"
-            + f"{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.qscale}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}-{self.skip}"
+            + f"{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.qscale}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}-{self.skip}-{self.sink}"
         )
 
     @property
@@ -384,6 +386,7 @@ class FmhaFwdPipeline:
     F_mask: str  # value from MASK_MAP
     F_skip: str  # true/false
     F_trload: str  # true/false
+    F_sink: str  # true/false
     F_constraint: CppConstraint = field(default_factory=lambda: CppConstraint())
 
     @property
@@ -454,6 +457,10 @@ class FmhaFwdPipeline:
             n += "_trload"
         else:
             n += "_ntrload"
+        if self.F_sink == "t":
+            n += "_sink"
+        else:
+            n += "_nsink"
 
         return n
 
@@ -543,6 +550,7 @@ class FmhaFwdApiPool:
                             F_trload=BOOL_MAP[trait.tr_load],
                             F_qscale_check=QSCALE_CHECK_MAP[trait.qscale],
                             F_qscale=QSCALE_MAP[trait.qscale],
+                            F_sink=BOOL_MAP[trait.sink],
                             F_scheck=trait.scheck,
                             F_seqtune=trait.seqtune(max_bm0),
                             F_skcheck=trait.skcheck,
@@ -683,6 +691,7 @@ class FmhaFwdKernel:
             F_pipeline=PIPELINE_MAP[self.F_pipeline.tag],
             F_kernel=self._get_cpp_kernel_class_name(self.F_pipeline.tag),
             F_kargs_creator=self._get_cpp_kargs_creator_func_name(self.F_pipeline.tag),
+            F_sink=BOOL_MAP[self.F_pipeline.F_sink],
         )
 
     @property
@@ -725,6 +734,7 @@ class FmhaFwdKernel:
             dvpad=self.F_pipeline.F_dvpad,
             skip=self.F_pipeline.F_skip,
             tr_load=self.F_pipeline.F_trload,
+            sink=self.F_pipeline.F_sink,
             constraint=self.F_tile.F_constraint & self.F_pipeline.F_constraint,
         )
 
@@ -957,52 +967,55 @@ class KernelComponentFactoryGfx9(CompatibilityRuleFactoryGfx9):
         pipelines = []
         if dtype in cls._DT_FP32:
             qscale = "no"
-            for logits, mask, bias, lse, dropout, skip in itertools.product(
+            for logits, mask, bias, lse, dropout, skip, sink in itertools.product(
                 ["t", "f"],
                 get_mask_map(mask_impl).keys(),
                 BIAS_MAP.keys(),
                 ["t", "f"],
                 ["t", "f"],
                 ["t", "f"],
+                ["t", "f"],
             ):
-                pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
-                pipelines.append(FmhaFwdPipeline("qr", "row", "f", "t", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
-                pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr", "row", "f", "t", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
         elif dtype in cls._DT_FP16_BF16:
             qscale = "no"
-            for logits, mask, bias, lse, dropout, skip in itertools.product(
+            for logits, mask, bias, lse, dropout, skip, sink in itertools.product(
                 ["t", "f"],
                 get_mask_map(mask_impl).keys(),
                 BIAS_MAP.keys(),
                 ["t", "f"],
                 ["t", "f"],
                 ["t", "f"],
+                ["t", "f"],
             ):
                 if hdim == 256 and hdim_v == 256:
-                    pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
+                    pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
                     # the below two is used for hdim vectorize load
-                    pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
-                    pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
+                    pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
+                    pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
                 else:
                     if bias == "bias":
                         # TODO: rocm 6.2 compiler problem if using qr_async for bias case
-                        pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
-                        pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
+                        pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
+                        pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
                     else:
-                        pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "f", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
-                        pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
+                        pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "f", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
+                        pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
                     if receipt == 1 and bias != "bias":
-                        pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip # TODO: cover arbitraty hdim# fmt: skip
+                        pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip # TODO: cover arbitraty hdim# fmt: skip
         elif dtype in cls._DT_FP8BF16 or dtype in cls._DT_FP8FP32:
             # no need lse/dropout kernels
-            for logits, qscale, mask, bias in itertools.product(
+            for logits, qscale, mask, bias, sink in itertools.product(
                 ["f"],
                 ["no", "pertensor"],
                 get_mask_map(mask_impl).keys(),
                 ["no"],
+                ["f", "t"],
             ):
-                pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "f", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f"))  # fmt: skip
-                pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f"))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "f", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", sink))  # fmt: skip
         elif dtype in ["fp8", "fp8fp16", "bf8"]:
             # TODO
             pass
@@ -1033,13 +1046,14 @@ class KernelComponentFactoryGfx950(
         )
         if dtype in cls._DT_FP16_BF16:
             qscale = "no"
-            for logits, mask, bias, lse, dropout, skip in itertools.product(
+            for logits, mask, bias, lse, dropout, skip, sink in itertools.product(
                 ["t", "f"],
                 get_mask_map(mask_impl).keys(),
                 BIAS_MAP.keys(),
                 ["t", "f"],
                 ["t", "f"],
                 ["t", "f"],
+                ["t", "f"],
             ):
                 if (
                     (hdim, hdim_v) in [(64, 64), (128, 128)]
@@ -1048,15 +1062,15 @@ class KernelComponentFactoryGfx950(
                     and dropout == "f"
                     and skip == "f"
                 ):
-                    pipelines.append(FmhaFwdPipeline("qr_async_trload", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "t"))  # fmt: skip
-                    pipelines.append(FmhaFwdPipeline("qr_async_trload", "row", "f", "f", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "t"))  # fmt: skip
+                    pipelines.append(FmhaFwdPipeline("qr_async_trload", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "t", sink))  # fmt: skip
+                    pipelines.append(FmhaFwdPipeline("qr_async_trload", "row", "f", "f", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "t", sink))  # fmt: skip
 
             # qr_async_trload_v3 only supports hdim=hdim_v=128 for now
             if (hdim, hdim_v) == (128, 128):
                 # qr_async_trload_v3 only supports (generic) causal mask
                 for mask in ["no", "causal"]:
                     pipelines.append(FmhaFwdPipeline("qr_async_trload_v3", "row", "t", "t", "f", "f",
-                        F_logits="f", F_bias="no", F_lse="f", F_dropout="f", F_qscale=qscale, F_mask=mask, F_skip="f", F_trload="t"))  # fmt: skip
+                        F_logits="f", F_bias="no", F_lse="f", F_dropout="f", F_qscale=qscale, F_mask=mask, F_skip="f", F_trload="t", F_sink="f"))  # fmt: skip
 
         return pipelines
 
@@ -1105,23 +1119,24 @@ class KernelComponentFactoryGfx12(CompatibilityRuleFactory):
         pipelines = []
         if dtype in cls._DT_FP16_BF16:
             qscale = "no"
-            for logits, mask, bias, lse, dropout, skip in itertools.product(
+            for logits, mask, bias, lse, dropout, skip, sink in itertools.product(
                 ["t", "f"],
                 get_mask_map(mask_impl).keys(),
                 BIAS_MAP.keys(),
                 ["t", "f"],
                 ["t", "f"],
                 ["t", "f"],
+                ["t", "f"],
             ):
-                pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
-                pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f"))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink))  # fmt: skip
         elif dtype in cls._DT_FP8_FP8BF16 or dtype in cls._DT_FP8FP32:
             # no need lse/dropout kernels
             for logits, qscale, mask, bias in itertools.product(
                 ["f"], ["no", "pertensor"], get_mask_map(mask_impl).keys(), ["no"]
             ):
-                pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, "f", "f", qscale, mask, "f", "f"))  # fmt: skip
-                pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f"))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, "f", "f", qscale, mask, "f", "f", "f"))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, "f", "f", "f"))  # fmt: skip
         return pipelines
 
 
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index 342a71e0b0..9105900fc7 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -73,7 +73,8 @@ using fmha_trait = ck_tile::TileFmhaFwdSplitKVTraits<{F_spad},
                                                      {F_pagedkv},
                                                      kHasUnevenSplits,
                                                      kMergeNumHeadGroupsSeqLenQ,
-                                                     {F_occupancy}>;
+                                                     {F_occupancy},
+                                                     {F_sink}>;
 
 using fmha_pipeline_problem = ck_tile::BlockFmhaFwdSplitKVPipelineProblem<
     typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::QDataType,
@@ -117,7 +118,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 }} // anonymous namespace
 
 using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
-                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad},
+                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_sink}, {F_spad}, {F_skpad}, {F_dpad},
                         {F_dvpad}>;
 
 #pragma clang diagnostic push
@@ -279,8 +280,8 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 """
 
 FMHA_FWD_SPLITKV_API_INNER_DISPATCH = """{F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) &&
-        ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-    using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+        ((a.block_table_ptr != nullptr) == {F_pagedkv}) && (t.has_sink == {F_sink}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
+    using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv},{F_sink}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
 
     // get combine kernel tile sizes
     using OaccDataType = typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType;
@@ -332,6 +333,7 @@ class FmhaFwdSplitKVApiTrait:
     dpad: str
     dvpad: str
     pagedkv: str
+    sink: str  # sink or not
     bn1comb: int  # tile size along v head_dim of combine kernel
 
     @property
@@ -339,7 +341,7 @@ class FmhaFwdSplitKVApiTrait:
         return (
             f"{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-"
             + f"{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-"
-            + f"{self.dvpad}-{self.pagedkv}"
+            + f"{self.dvpad}-{self.pagedkv}-{self.sink}"
         )
 
     @property
@@ -425,6 +427,7 @@ class FmhaFwdSplitKVPipeline:
     F_lse: str  #
     F_squant: str  #
     F_pagedkv: str  # t/f
+    F_sink: str  # t/f
     F_mask: str  # value from MASK_MAP
 
     @property
@@ -485,6 +488,10 @@ class FmhaFwdSplitKVPipeline:
             n += "_pagedkv"
         else:
             n += "_npagedkv"
+        if self.F_sink == "t":
+            n += "_sink"
+        else:
+            n += "_nsink"
         return n
 
 
@@ -567,6 +574,7 @@ class FmhaFwdSplitKVApiPool:
                             F_lse=BOOL_MAP[trait.lse],
                             F_squant=BOOL_MAP[trait.squant],
                             F_pagedkv=BOOL_MAP[trait.pagedkv],
+                            F_sink=BOOL_MAP[trait.sink],
                             F_scheck=trait.scheck,
                             F_skcheck=trait.skcheck,
                             F_dcheck=trait.dcheck,
@@ -667,6 +675,7 @@ class FmhaFwdSplitKVKernel:
             F_squant=BOOL_MAP[self.F_pipeline.F_squant],
             F_pagedkv=BOOL_MAP[self.F_pipeline.F_pagedkv],
             F_occupancy=self.F_tile.F_occupancy,
+            F_sink=BOOL_MAP[self.F_pipeline.F_sink],
             F_pipeline_enum=PIPELINE_ENUM_MAP[self.F_pipeline.tag],
             F_mask=get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
             F_mode=MODE_MAP[self.F_mode],
@@ -740,19 +749,23 @@ class KernelComponentFactoryBase:
         squant = "t" if dtype == "fp8" else "f"
         pipelines = []
         if dtype in ["fp16", "bf16"]:
-            for logits, mask, bias, pagedkv in itertools.product(
-                ["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]
+            for logits, mask, bias, pagedkv, sink in itertools.product(
+                ["t", "f"],
+                get_mask_map(mask_impl).keys(),
+                BIAS_MAP.keys(),
+                ["t", "f"],
+                ["t", "f"],
             ):
-                pipelines.append(Pipeline("qr", "row", "f", "t", "f", "f", logits, bias, "t", squant, pagedkv, mask))  # fmt: skip
-                pipelines.append(Pipeline("qr", "row", "t", "f", "f", "f", logits, bias, "t", squant, pagedkv, mask))  # fmt: skip
-                pipelines.append(Pipeline("qr", "row", "t", "t", "f", "f", logits, bias, "t", squant, pagedkv, mask))  # fmt: skip
-                pipelines.append(Pipeline("qr", "row", "t", "t", "t", "t", logits, bias, "t", squant, pagedkv, mask))  # fmt: skip
+                pipelines.append(Pipeline("qr", "row", "f", "t", "f", "f", logits, bias, "t", squant, pagedkv, sink, mask))  # fmt: skip
+                pipelines.append(Pipeline("qr", "row", "t", "f", "f", "f", logits, bias, "t", squant, pagedkv, sink, mask))  # fmt: skip
+                pipelines.append(Pipeline("qr", "row", "t", "t", "f", "f", logits, bias, "t", squant, pagedkv, sink, mask))  # fmt: skip
+                pipelines.append(Pipeline("qr", "row", "t", "t", "t", "t", logits, bias, "t", squant, pagedkv, sink, mask))  # fmt: skip
         elif dtype in ["fp8", "bf8"]:
             for logits, mask, bias in itertools.product(
                 ["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()
             ):
-                pipelines.append(Pipeline("qr", "row", "f", "f", "f", "f", logits, bias, "t", squant, "f", mask))  # fmt: skip
-                pipelines.append(Pipeline("qr", "row", "t", "t", "f", "f", logits, bias, "t", squant, "f", mask))  # fmt: skip
+                pipelines.append(Pipeline("qr", "row", "f", "f", "f", "f", logits, bias, "t", squant, "f", "f", mask))  # fmt: skip
+                pipelines.append(Pipeline("qr", "row", "t", "t", "f", "f", logits, bias, "t", squant, "f", "f", mask))  # fmt: skip
         elif dtype in ["fp8fp16", "fp8bf16"]:
             # TODO
             None
@@ -908,6 +921,7 @@ def get_fwd_splitkv_blobs(
                     cond &= pipeline.F_vlayout == "row"
                     cond &= pipeline.F_bias in ["no", "alibi"]
                     cond &= pipeline.F_squant == "f"
+                    cond &= pipeline.F_sink == "f"
                     if not cond:
                         continue
                 # PyTorch integration
@@ -917,6 +931,7 @@ def get_fwd_splitkv_blobs(
                     cond &= pipeline.F_bias in ["no", "bias"]
                     cond &= pipeline.F_squant == "f"
                     cond &= mode == "batch"
+                    cond &= pipeline.F_sink == "f"
                     if not cond:
                         continue
                 # Aiter(mha_varlen_fwd) integration
@@ -1075,6 +1090,7 @@ def write_blobs(
                 lse=kernel.F_pipeline.F_lse,
                 squant=kernel.F_pipeline.F_squant,
                 pagedkv=kernel.F_pipeline.F_pagedkv,
+                sink=kernel.F_pipeline.F_sink,
                 spad=kernel.F_pipeline.F_spad,
                 skpad=kernel.F_pipeline.F_skpad,
                 dpad=kernel.F_pipeline.F_dpad,
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py b/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
index e6eb893a2f..cdb43c3480 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
@@ -65,7 +65,8 @@ using fmha_trait_{F_idx} = ck_tile::TileFmhaFwdPagedKVTraits<{F_spad},
                                                              {F_pagedkv},  //pagedkv
                                                              {F_squant},
                                                              {F_occupancy},
-                                                             {F_skip}>;
+                                                             {F_skip},
+                                                             {F_sink}>;
 
 using fmha_variant_{F_idx} = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>;
 
@@ -100,7 +101,7 @@ using fmha_kernel_{F_idx} =
     ck_tile::FmhaFwdPagedKVKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;
 
 using trait_{F_idx} = fmha_fwd_pagedkv_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
-                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_pagedkv}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_skip}>;
+                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_pagedkv}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_skip}, {F_sink}>;
 
 template<>
 float fmha_fwd_pagedkv_<trait_{F_idx}, {F_arch.tag}>(const ck_tile::stream_config& s, fmha_fwd_pagedkv_args a)
@@ -129,9 +130,9 @@ float fmha_fwd_pagedkv(fmha_fwd_pagedkv_traits& t, fmha_fwd_pagedkv_args& a, con
 }}
 """
 
-FMHA_FWD_API_INNER_DISPATCH = """{F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.use_pagedkv == {F_pagedkv}) && (t.do_fp8_static_quant == {F_squant}) && (t.skip_min_seqlen_q == {F_skip}) &&
+FMHA_FWD_API_INNER_DISPATCH = """{F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.use_pagedkv == {F_pagedkv}) && (t.do_fp8_static_quant == {F_squant}) && (t.skip_min_seqlen_q == {F_skip}) && (t.has_sink == {F_sink}) &&
         ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-    using trait_ = fmha_fwd_pagedkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_pagedkv}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_skip}>;
+    using trait_ = fmha_fwd_pagedkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_pagedkv}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_skip},{F_sink}>;
     return fmha_fwd_pagedkv_<trait_, {F_arch.tag}>(s, a);
 }}
 """
@@ -163,12 +164,13 @@ class FmhaFwdApiTrait:
     dpad: str
     dvpad: str
     skip: str
+    sink: str
 
     @property
     def name(self) -> str:
         return (
             f"{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-"
-            + f"{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.pagedkv}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}-{self.skip}"
+            + f"{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.pagedkv}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}-{self.skip}-{self.sink}"
         )
 
     @property
@@ -256,6 +258,7 @@ class FmhaFwdPipeline:
     F_squant: str  #
     F_mask: str  # value from MASK_MAP
     F_skip: str  # true/false
+    F_sink: str  # true/false
 
     @property
     def name(self) -> str:
@@ -320,6 +323,10 @@ class FmhaFwdPipeline:
             n += "_pagedkv"
         else:
             n += "_npagedkv"
+        if self.F_sink == "t":
+            n += "_sink"
+        else:
+            n += "_nsink"
 
         return n
 
@@ -363,6 +370,7 @@ class FmhaFwdApiPool:
                             F_lse=BOOL_MAP[trait.lse],
                             F_pagedkv=BOOL_MAP[trait.pagedkv],
                             F_skip=BOOL_MAP[trait.skip],
+                            F_sink=BOOL_MAP[trait.sink],
                             F_squant=BOOL_MAP[trait.squant],
                             F_scheck=trait.scheck,
                             F_skcheck=trait.skcheck,
@@ -480,6 +488,7 @@ class FmhaFwdKernel:
             F_pagedkv=BOOL_MAP[self.F_pipeline.F_pagedkv],
             F_squant=BOOL_MAP[self.F_pipeline.F_squant],
             F_skip=BOOL_MAP[self.F_pipeline.F_skip],
+            F_sink=BOOL_MAP[self.F_pipeline.F_sink],
             F_occupancy=self.F_tile.F_occupancy,
             F_pipeline_enum=PIPELINE_ENUM_MAP[self.F_pipeline.tag],
             F_mask=get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
@@ -526,6 +535,7 @@ class FmhaFwdKernel:
             dpad=self.F_pipeline.F_dpad,
             dvpad=self.F_pipeline.F_dvpad,
             skip=self.F_pipeline.F_skip,
+            sink=self.F_pipeline.F_sink,
         )
 
 
@@ -539,22 +549,23 @@ class KernelComponentFactoryBase:
         squant = "t" if dtype == "fp8" else "f"
         pipelines = []
         if dtype in ["fp16", "bf16"]:
-            for logits, mask, bias, pagedkv, skip in itertools.product(
+            for logits, mask, bias, pagedkv, skip, sink in itertools.product(
                 ["t", "f"],
                 get_mask_map(mask_impl).keys(),
                 BIAS_MAP.keys(),
                 ["t"],
                 ["f"],
+                ["t", "f"],
             ):
-                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "t", "f", "f", "f", logits, bias, "f", pagedkv, squant, mask, skip))  # fmt: skip
-                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "t", "t", "f", "f", logits, bias, "f", pagedkv, squant, mask, skip))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "t", "f", "f", "f", logits, bias, "f", pagedkv, squant, mask, skip, sink))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "t", "t", "f", "f", logits, bias, "f", pagedkv, squant, mask, skip, sink))  # fmt: skip
         elif dtype in ["fp8", "bf8"]:
             # no need lse/dropout kernels
             for logits, mask, bias in itertools.product(
                 ["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()
             ):
-                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "f", "f", "f", "f", logits, bias, "f", "t", squant, mask, "f"))  # fmt: skip
-                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "t", "t", "f", "f", logits, bias, "f", "t", squant, mask, "f"))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "f", "f", "f", "f", logits, bias, "f", "t", squant, mask, "f", "f"))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "t", "t", "f", "f", logits, bias, "f", "t", squant, mask, "f", "f"))  # fmt: skip
         elif dtype in ["fp8fp16", "fp8bf16"]:
             pass  # TODO
         else:
@@ -678,6 +689,7 @@ def get_fwd_blobs(
                     cond &= pipeline.F_bias in ["no", "alibi"]
                     cond &= pipeline.F_squant == "f"
                     cond &= pipeline.F_skip == "f"
+                    cond &= pipeline.F_sink == "f"
                     if not cond:
                         continue
                 # PyTorch integration
@@ -687,6 +699,7 @@ def get_fwd_blobs(
                     cond &= pipeline.F_bias in ["no", "bias"]
                     cond &= pipeline.F_squant == "f"
                     cond &= pipeline.F_skip == "f"
+                    cond &= pipeline.F_sink == "f"
                     if not cond:
                         continue
                 # Aiter(mha_fwd) integration
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index 002d0a1035..60ba334fc0 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -266,6 +266,7 @@ struct fmha_fwd_args
 
     ck_tile::index_t window_size_left;
     ck_tile::index_t window_size_right;
+    ck_tile::index_t sink_size;
     ck_tile::index_t mask_type;
     ck_tile::index_t min_seqlen_q;
 
@@ -352,6 +353,7 @@ struct fmha_fwd_pagedkv_args
 
     ck_tile::index_t window_size_left;
     ck_tile::index_t window_size_right;
+    ck_tile::index_t sink_size;
     ck_tile::index_t mask_type;
     ck_tile::index_t min_seqlen_q;
 };
@@ -442,6 +444,7 @@ struct fmha_fwd_splitkv_args
 
     ck_tile::index_t window_size_left;
     ck_tile::index_t window_size_right;
+    ck_tile::index_t sink_size;
     ck_tile::index_t mask_type;
 };
 
@@ -561,6 +564,7 @@ struct fmha_batch_prefill_args
 
     ck_tile::index_t window_size_left;
     ck_tile::index_t window_size_right;
+    ck_tile::index_t sink_size;
     ck_tile::index_t mask_type;
 
     float p_drop;
@@ -613,6 +617,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                              args.nhead_stride_o,
                                              args.window_size_left,
                                              args.window_size_right,
+                                             args.sink_size,
                                              args.mask_type,
                                              args.min_seqlen_q,
                                              args.p_drop,
@@ -663,6 +668,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                              args.batch_stride_o,
                                              args.window_size_left,
                                              args.window_size_right,
+                                             args.sink_size,
                                              args.mask_type,
                                              args.p_drop,
                                              args.s_randval,
@@ -824,6 +830,7 @@ auto fmha_fwd_pagedkv_create_kargs_and_grids(fmha_fwd_pagedkv_args args)
                                          args.batch_stride_v,
                                          args.window_size_left,
                                          args.window_size_right,
+                                         args.sink_size,
                                          args.mask_type,
                                          args.min_seqlen_q);
         }
@@ -869,6 +876,7 @@ auto fmha_fwd_pagedkv_create_kargs_and_grids(fmha_fwd_pagedkv_args args)
                                          args.batch_stride_o,
                                          args.window_size_left,
                                          args.window_size_right,
+                                         args.sink_size,
                                          args.mask_type);
         }
     }();
@@ -935,6 +943,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                      args.split_stride_o_acc,
                                      args.window_size_left,
                                      args.window_size_right,
+                                     args.sink_size,
                                      args.mask_type);
         }
         else
@@ -982,6 +991,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                      args.split_stride_o_acc,
                                      args.window_size_left,
                                      args.window_size_right,
+                                     args.sink_size,
                                      args.mask_type);
         }
     }();
@@ -1142,6 +1152,7 @@ auto fmha_batch_prefill_create_kargs_and_grids(fmha_batch_prefill_args args)
                                          args.batch_stride_v,
                                          args.window_size_left,
                                          args.window_size_right,
+                                         args.sink_size,
                                          args.mask_type,
                                          args.p_drop,
                                          args.s_randval,
@@ -1194,6 +1205,7 @@ auto fmha_batch_prefill_create_kargs_and_grids(fmha_batch_prefill_args args)
                                          args.batch_stride_o,
                                          args.window_size_left,
                                          args.window_size_right,
+                                         args.sink_size,
                                          args.mask_type,
                                          args.p_drop,
                                          args.s_randval,
@@ -1228,7 +1240,8 @@ template <ck_tile::index_t HDim_,
           bool kPadD_,
           bool kPadDv_,
           bool kUseTrLoad_,
-          bool kSkipMinSeqlenQ_ = false>
+          bool kSkipMinSeqlenQ_ = false,
+          bool kHasSink_        = false>
 struct fmha_fwd_traits_
 {
     static constexpr ck_tile::index_t HDim           = HDim_;
@@ -1254,6 +1267,7 @@ struct fmha_fwd_traits_
     static constexpr bool kPadDv                     = kPadDv_;
     static constexpr bool kUseTrLoad                 = kUseTrLoad_;
     static constexpr bool kSkipMinSeqlenQ            = kSkipMinSeqlenQ_;
+    static constexpr bool kHasSink                   = kHasSink_;
 };
 
 template <typename Traits_, typename Arch = void>
@@ -1280,7 +1294,8 @@ template <ck_tile::index_t HDim_,
           bool kPadSK_,
           bool kPadD_,
           bool kPadDv_,
-          bool kSkipMinSeqlenQ_ = false>
+          bool kSkipMinSeqlenQ_ = false,
+          bool kHasSink_        = false>
 struct fmha_fwd_pagedkv_traits_
 {
     static constexpr ck_tile::index_t HDim           = HDim_;
@@ -1305,6 +1320,7 @@ struct fmha_fwd_pagedkv_traits_
     static constexpr bool kPadD                      = kPadD_;
     static constexpr bool kPadDv                     = kPadDv_;
     static constexpr bool kSkipMinSeqlenQ            = kSkipMinSeqlenQ_;
+    static constexpr bool kHasSink                   = kHasSink_;
 };
 
 template <typename Traits_, typename Arch = void>
@@ -1327,6 +1343,7 @@ template <ck_tile::index_t HDim_,
           bool kStoreLse_,
           bool kDoFp8StaticQuant_,
           bool kIsPagedKV_,
+          bool kHasSink_,
           bool kPadS_,
           bool kPadSK_,
           bool kPadD_,
@@ -1354,6 +1371,7 @@ struct fmha_fwd_splitkv_traits_
     static constexpr bool kPadD                      = kPadD_;
     static constexpr bool kPadDv                     = kPadDv_;
     static constexpr bool kIsPagedKV                 = kIsPagedKV_;
+    static constexpr bool kHasSink                   = kHasSink_;
 };
 
 template <typename Traits_, typename Arch = void>
@@ -1440,6 +1458,7 @@ struct fmha_fwd_traits
     bool has_dropout;
     quant_scale_enum qscale_type;
     bool skip_min_seqlen_q = false;
+    bool has_sink          = false;
     // TODO: padding check is inside this api
 };
 float fmha_fwd(fmha_fwd_traits, fmha_fwd_args, const ck_tile::stream_config&);
@@ -1458,6 +1477,7 @@ struct fmha_fwd_pagedkv_traits
     bool use_pagedkv         = true;
     bool do_fp8_static_quant = false;
     bool skip_min_seqlen_q   = false;
+    bool has_sink            = false;
     // TODO: padding check is inside this api
 };
 
@@ -1477,6 +1497,7 @@ struct fmha_fwd_splitkv_traits
     bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
     bool has_lse;
     bool do_fp8_static_quant;
+    bool has_sink = false;
     // TODO: padding check is inside this api
 };
 float fmha_fwd_splitkv(fmha_fwd_splitkv_traits,
diff --git a/example/ck_tile/01_fmha/fmha_fwd_runner.hpp b/example/ck_tile/01_fmha/fmha_fwd_runner.hpp
index bca4c60bc6..536fcb0692 100644
--- a/example/ck_tile/01_fmha/fmha_fwd_runner.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd_runner.hpp
@@ -879,6 +879,7 @@ fwd_result fmha_fwd_run(mode_enum mode,
             traits.has_logits_soft_cap = 0.f < logits_soft_cap;
             traits.mask_type           = mask.type;
             traits.bias_type           = bias.type;
+            traits.has_sink            = mask.sink > 0 ? true : false;
             traits.has_lse             = lse;
 
             if constexpr(std::is_same_v<fmha_fwd_traits, std::decay_t<decltype(traits)>>)
@@ -1042,6 +1043,7 @@ fwd_result fmha_fwd_run(mode_enum mode,
 
             args.window_size_left  = mask.left;
             args.window_size_right = mask.right;
+            args.sink_size         = mask.sink;
             args.mask_type         = static_cast<ck_tile::index_t>(mask.type);
 
             if constexpr(std::is_same_v<fmha_fwd_args, std::decay_t<decltype(args)>>)
@@ -1645,7 +1647,7 @@ fwd_result fmha_fwd_run(mode_enum mode,
                 ck_tile::reference_batched_masking<SaccDataType>(
                     s_host_ref,
                     ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::GenericMask>(
-                        mask.left, mask.right, real_seqlen_q, real_seqlen_k));
+                        mask.left, mask.right, mask.sink, real_seqlen_q, real_seqlen_k));
             }
             else
             {
@@ -1657,6 +1659,7 @@ fwd_result fmha_fwd_run(mode_enum mode,
                         ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::CausalMask>(
                             mask.left,
                             mask.right,
+                            mask.sink,
                             real_seqlen_q,
                             real_seqlen_k,
                             mask.type == mask_enum::mask_top_left));
@@ -1666,6 +1669,7 @@ fwd_result fmha_fwd_run(mode_enum mode,
                         ck_tile::make_generic_attention_mask_from_lr_window<FmhaMasks::GenericMask>(
                             mask.left,
                             mask.right,
+                            mask.sink,
                             real_seqlen_q,
                             real_seqlen_k,
                             mask.type == mask_enum::mask_top_left));
diff --git a/example/ck_tile/01_fmha/mask.hpp b/example/ck_tile/01_fmha/mask.hpp
index 32157a2245..f85b811116 100644
--- a/example/ck_tile/01_fmha/mask.hpp
+++ b/example/ck_tile/01_fmha/mask.hpp
@@ -25,6 +25,7 @@ struct mask_info
     ck_tile::index_t seqlen_k;
     ck_tile::index_t y, x;
     ck_tile::index_t left, right; // FA style SWA left/right
+    ck_tile::index_t sink;
 
     void serialize(std::ostream& os) const
     {
@@ -58,13 +59,14 @@ struct mask_info
                 ck_tile::index_t window_size = std::stoi(v);
                 ck_tile::index_t left_size   = -1;
                 ck_tile::index_t right_size  = 0;
+                ck_tile::index_t sink_size   = 0;
                 if(window_size > 0)
                 {
                     left_size  = window_size / 2;
                     right_size = window_size - 1 - left_size;
                 }
                 auto r = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
-                    left_size, right_size, y_total, x_total, t == "xt");
+                    left_size, right_size, sink_size, y_total, x_total, t == "xt");
 
                 tmp.type  = t == "xt" ? mask_enum::mask_top_left : mask_enum::mask_bottom_right;
                 tmp.y     = r.at(ck_tile::number<0>{});
@@ -79,27 +81,54 @@ struct mask_info
                 {
                     throw std::invalid_argument("invalid mask value: " + str);
                 }
-                ck_tile::index_t v0 = std::stoi(v.substr(0, found_1));
-                ck_tile::index_t v1 = std::stoi(v.substr(found_1 + 1));
+                tmp.type              = mask_enum::window_generic;
+                ck_tile::index_t v0   = atoi(v.substr(0, found_1).c_str());
+                auto found_2          = v.find(',', found_1 + 1);
+                // parse v1 up front so the "g" branch sees it too (atoi stops at ',')
+                ck_tile::index_t v1   = atoi(v.substr(found_1 + 1).c_str());
+                ck_tile::index_t sink = 0;
+                // TODO: some validation
                 if(t == "t")
                 {
+                    if(found_2 != std::string::npos)
+                    {
+                        v1   = atoi(v.substr(found_1 + 1, found_2 - found_1 - 1).c_str());
+                        sink = atoi(v.substr(found_2 + 1).c_str());
+                    }
+                    else
+                    {
+                        v1   = atoi(v.substr(found_1 + 1).c_str());
+                        sink = 0;
+                    }
                     tmp.type = mask_enum::mask_top_left;
                     auto r   = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
-                        v0, v1, y_total, x_total, true);
+                        v0, v1, sink, y_total, x_total, true);
                     tmp.y     = r.at(ck_tile::number<0>{});
                     tmp.x     = r.at(ck_tile::number<1>{});
                     tmp.left  = v0;
                     tmp.right = v1;
+                    tmp.sink  = sink;
                 }
                 else if(t == "b")
                 {
+                    if(found_2 != std::string::npos)
+                    {
+                        v1   = atoi(v.substr(found_1 + 1, found_2 - found_1 - 1).c_str());
+                        sink = atoi(v.substr(found_2 + 1).c_str());
+                    }
+                    else
+                    {
+                        v1   = atoi(v.substr(found_1 + 1).c_str());
+                        sink = 0;
+                    }
                     tmp.type = mask_enum::mask_bottom_right;
                     auto r   = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
-                        v0, v1, y_total, x_total, false);
+                        v0, v1, sink, y_total, x_total, false);
                     tmp.y     = r.at(ck_tile::number<0>{});
                     tmp.x     = r.at(ck_tile::number<1>{});
                     tmp.left  = v0;
                     tmp.right = v1;
+                    tmp.sink  = sink;
                 }
                 else if(t == "g")
                 {
@@ -108,6 +137,7 @@ struct mask_info
                     tmp.x     = v1;
                     tmp.left  = v0; // TODO: don't use this?
                     tmp.right = v1;
+                    tmp.sink  = 0;
                 }
             }
             else
@@ -126,6 +156,7 @@ struct mask_info
             tmp.x     = 1;
             tmp.left  = -1;
             tmp.right = 0;
+            tmp.sink  = 0;
         }
         else if(str == "2" || str == "b")
         {
@@ -134,6 +165,7 @@ struct mask_info
             tmp.x     = seqlen_k - seqlen_q + 1;
             tmp.left  = -1;
             tmp.right = 0;
+            tmp.sink  = 0;
         }
         else
         {
diff --git a/example/ck_tile/01_fmha/script/correct_test_fwd_sink.sh b/example/ck_tile/01_fmha/script/correct_test_fwd_sink.sh
new file mode 100644
index 0000000000..de3bff25ed
--- /dev/null
+++ b/example/ck_tile/01_fmha/script/correct_test_fwd_sink.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# TODO: run this script from CK root or build directory
+EXE="$(find . -name tile_example_fmha_fwd -type f | head -n 1)"
+KNAME=1
+
+export CK_WARMUP=0
+export CK_REPEAT=1
+
+COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
+# mode=0
+# export HIP_VISIBLE_DEVICES=4
+
+TEST_SPLITKV=0
+TEST_APPENDKV=0
+# options:
+#    -s: run splitkv tests
+#    -a: run appendkv tests
+while getopts ":sa" opt; do
+    case "${opt}" in
+        s)
+            TEST_SPLITKV=1
+            ;;
+        a)
+            TEST_APPENDKV=1
+            ;;
+        *)
+            ;;
+    esac
+done
+
+run_fp16_bf16_tests() {
+    local NUM_SPLITS="1"
+    local PAGE_BLOCK_SIZE="0"
+    local CACHE_BATCH_IDX="0"
+
+    if [ $TEST_SPLITKV -eq 1 ] ; then
+        NUM_SPLITS="$NUM_SPLITS 2 3"
+        PAGE_BLOCK_SIZE="$PAGE_BLOCK_SIZE 128"
+        CACHE_BATCH_IDX="$CACHE_BATCH_IDX 1"
+    fi
+
+    for prec in "fp16"; do 
+    for mode in 1 0 ; do
+    for perm in 0 1 ; do
+    for vlayout in "r" "c" ; do
+    for batch in 1 4; do
+    for head in 1; do
+    for h_k in 1; do
+    for q_seq in 128 512 ; do
+    for kv_seq in 128 1024; do
+    for hdim in 32 64 128 256; do #256 
+    for lse in 0 1 ; do
+    for bias in "e" ; do
+    for p_drop in 0.0 0.2; do # 0.0   
+    for mask in "t:2,0,4" "b:1,0,2"; do
+    for num_splits in $NUM_SPLITS ; do
+    for page_block_size in $PAGE_BLOCK_SIZE ; do
+    for cache_batch_idx in $CACHE_BATCH_IDX ; do
+
+    # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=$batch -h=$head -h_k=$h_k -d=16 -d_v=$hdim -s=$q_seq -s_k=$kv_seq -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS -mask=$mask
+
+    done ; done ; done ; done ; done
+    done ; done ; done ; done ; done
+    done ; done ; done ; done ; done
+    done ; done
+}
+
+
+set -x
+
+run_fp16_bf16_tests
+
+set +x
diff --git a/example/ck_tile/01_fmha/script/run_full_test.sh b/example/ck_tile/01_fmha/script/run_full_test.sh
index 4fbde37cae..456c3986fa 100755
--- a/example/ck_tile/01_fmha/script/run_full_test.sh
+++ b/example/ck_tile/01_fmha/script/run_full_test.sh
@@ -39,6 +39,7 @@ function print_log_header(){
 #run verification tests
 time example/ck_tile/01_fmha/script/smoke_test_fwd.sh
 time example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+time example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh
 
 #run performance benchmarks
 export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log"
diff --git a/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh b/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh
new file mode 100755
index 0000000000..664c825418
--- /dev/null
+++ b/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# TODO: run this script from CK root or build directory
+#EXE="/code/composable_kernel/build/bin/tile_example_fmha_fwd"
+set -euo pipefail
+
+SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+EXE_NAME=tile_example_fmha_fwd
+EXE="$(find . -name $EXE_NAME -type f | head -n 1)"
+KNAME=1
+GPU_arch=${GPU_arch:-} # default to empty: a bare $GPU_arch aborts under 'set -u' when unset
+if [ -z "$GPU_arch" ] ; then
+    GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}')
+fi
+set -x
+
+COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
+
+
+$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -mask=t:2,0,2
+
+# window_size[2,0], sink_size = 2
+
+#    x=1/y=3                 
+#    1 * * * * * * *           1 * * * * * * *  
+#    1 1 * * * * * *           1 1 * * * * * *
+#    1 1 1 * * * * *   ---->   1 1 1 * * * * * 
+#    * 1 1 1 * * * *           1 1 1 1 * * * * 
+#    * * 1 1 1 * * *           1 1 1 1 1 * * * 
+#    * * * 1 1 1 * *           1 1 * 1 1 1 * * 
+#    * * * * 1 1 1 *           1 1 * * 1 1 1 *
+#    * * * * * 1 1 1           1 1 * * * 1 1 1
+#    l=2/r=0(tl)               l=2/r=0/s=2(tl)
+
+$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -mask=t:0,3,2 #-mask=b:3,0,2
+
+#    x=4/y=1                   
+#    1 1 1 1 * * * *           1 1 1 1 * * * * 
+#    * 1 1 1 1 * * *           1 1 1 1 1 * * *
+#    * * 1 1 1 1 * *   ---->   1 1 1 1 1 1 * *
+#    * * * 1 1 1 1 *           1 1 * 1 1 1 1 *
+#    * * * * 1 1 1 1           1 1 * * 1 1 1 1 
+#    l=0/r=3(tl)               l=0/r=3/s=2(tl)
+#    l=3/r=0(br)               l=3/r=0/s=2(br)  
+
+
+$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:1,0,2
+
+#    x=4/y=-1          
+#    * * 1 1 * * * *           1 1 1 1 * * * * 
+#    * * * 1 1 * * *           1 1 * 1 1 * * *
+#    * * * * 1 1 * *   ---->   1 1 * * 1 1 * *
+#    * * * * * 1 1 *           1 1 * * * 1 1 *
+#    * * * * * * 1 1           1 1 * * * * 1 1 
+#    l=1/r=0(br)               l=1/r=0/s=2(br)
+
+
+$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:2,0,2
+
+#    x=-1/y=5 
+     
+#    * * * * * *               * * * * * *
+#    * * * * * *               * * * * * *
+#    1 * * * * *               1 * * * * *
+#    1 1 * * * *               1 1 * * * *
+#    1 1 1 * * *       ---->   1 1 1 * * *
+#    * 1 1 1 * *               1 1 1 1 * *
+#    * * 1 1 1 *               1 1 1 1 1 *  
+#    * * * 1 1 1               1 1 * 1 1 1
+#    l=2/r=0(br)               l=2/r=0/s=2(br)
+
+
+$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=16384 -s_k=16384 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:-1,1,2
+#      x=-1/y=8
+#    * * * * *               * * * * *    
+#    * * * * *               * * * * * 
+#    1 * * * *      ---->    1 * * * * 
+#    1 1 * * *               1 1 * * * 
+#    1 1 1 * *               1 1 1 * * 
+#    1 1 1 1 *               1 1 1 1 * 
+#    1 1 1 1 1               1 1 1 1 1 
+#    1 1 1 1 1               1 1 1 1 1 
+#    l=-1/r=1(br)            l=-1/r=1/s=2(br)
+     
diff --git a/include/ck_tile/host/reference/reference_batched_masking.hpp b/include/ck_tile/host/reference/reference_batched_masking.hpp
index c2dd8abe23..a172a0013e 100644
--- a/include/ck_tile/host/reference/reference_batched_masking.hpp
+++ b/include/ck_tile/host/reference/reference_batched_masking.hpp
@@ -20,7 +20,7 @@ CK_TILE_HOST void reference_batched_masking(HostTensor<CDataType>& c_b_m_n, cons
         {
             for(int m = 0; m < M; ++m)
             {
-                if(mask.IsOutOfBound(m, n))
+                if(mask.IsOutOfSinkBound(m, n))
                     c_b_m_n(batch, m, n) = -ck_tile::numeric<CDataType>::infinity();
             }
         }
diff --git a/include/ck_tile/ops/fmha/block/block_masking.hpp b/include/ck_tile/ops/fmha/block/block_masking.hpp
index 756968871d..4ffb303812 100644
--- a/include/ck_tile/ops/fmha/block/block_masking.hpp
+++ b/include/ck_tile/ops/fmha/block/block_masking.hpp
@@ -86,21 +86,22 @@ struct GenericAttentionMask
     static constexpr const char* name = impl::MaskName<IsMasking, IsLocal>::name;
 
     CK_TILE_HOST_DEVICE GenericAttentionMask(index_t y_total_, index_t x_total_)
-        : GenericAttentionMask(0, 0, y_total_, x_total_)
+        : GenericAttentionMask(0, 0, 0, y_total_, x_total_)
     {
     }
 
     CK_TILE_HOST_DEVICE
-    GenericAttentionMask(index_t y_, index_t x_, index_t y_total_, index_t x_total_)
-        : y(y_), x(x_), y_total(y_total_), x_total(x_total_)
+    GenericAttentionMask(index_t y_, index_t x_, index_t sink_, index_t y_total_, index_t x_total_)
+        : y(y_), x(x_), sink(sink_), y_total(y_total_), x_total(x_total_)
     {
     }
     template <typename MaskCoordinates>
     CK_TILE_HOST_DEVICE GenericAttentionMask(const MaskCoordinates& mask_coord)
         : y(mask_coord.at(number<0>{})),
           x(mask_coord.at(number<1>{})),
-          y_total(mask_coord.at(number<2>{})),
-          x_total(mask_coord.at(number<3>{}))
+          sink(mask_coord.at(number<2>{})),
+          y_total(mask_coord.at(number<3>{})),
+          x_total(mask_coord.at(number<4>{}))
     {
     }
 
@@ -141,6 +142,44 @@ struct GenericAttentionMask
         }
     }
 
+    template <index_t YTile, index_t XTile>
+    CK_TILE_HOST_DEVICE constexpr auto
+    GetSinkTileRangeAlongX(index_t i_y, number<YTile>, number<XTile>) const
+    {
+        if constexpr(!IsMasking)
+        {
+            return ck_tile::make_tuple(0, 0, x_total);
+        }
+        else
+        {
+            // get the tile start/end range, assuming we loop over X tile by tile
+            index_t x_start = [&]() {
+                if constexpr(IsLocal)
+                {
+                    index_t tmp = max(-y + i_y + 1, 0);
+                    return (tmp / XTile) * XTile; // round to tile aligned
+                }
+                else
+                {
+                    return 0;
+                }
+            }();
+
+            // TODO: end could be negative, we ignore clamp here, and let caller to check
+            //      ... in which case end-start is negative
+            index_t x_end = [&]() {
+                index_t tmp = min(i_y + YTile - 1 + x, x_total);
+                return ((tmp + XTile - 1) / XTile) * XTile;
+            }();
+
+            index_t sink_seq_end = sink > 0 ? ((sink + XTile - 1) / XTile) * XTile : 0;
+            if(x_start <= sink_seq_end && sink > 0)
+                return ck_tile::make_tuple(0, 0, x_end);
+            else
+                return ck_tile::make_tuple(sink_seq_end, x_start, x_end);
+        }
+    }
+
     // to get the loop length along Y axis, return index:[start, end), end-start=length
     // use this if need loop over Y axis tile by tile (like q-seqlen loopover)
     // TODO: y_end still could be negative, so end-start could be negative(need check)
@@ -195,6 +234,30 @@ struct GenericAttentionMask
         }
     }
 
+    CK_TILE_HOST_DEVICE constexpr auto IsOutOfSinkBound(index_t i_y, index_t i_x) const
+    {
+        if constexpr(!IsMasking)
+            return i_x >= x_total;
+        // no need to do min/max here, since i_x will never be < 0 or >= x_total
+        index_t x_start = -y + i_y + 1;
+        index_t x_end   = min(i_y + x, x_total);
+
+        if constexpr(IsLocal)
+        {
+            if((i_x < sink) && (y < y_total) && ((i_y + x) > 1) && i_y < x_total)
+                return false;
+            else
+                return i_x < x_start || i_x >= x_end;
+        }
+        else
+        {
+            if((i_x < sink) && (y < y_total) && ((i_y + x) > 1) && i_y < x_total)
+                return false;
+            else
+                return i_x >= x_end || i_y >= y_total;
+        }
+    }
+
     // if current tile is at the edge, means need per-pixel mask check.
     // otherwise no need to check per-pixel
     // Attention! assume the idex passed in this function is with in range of GetTileRangeAlongX/Y()
@@ -237,7 +300,7 @@ struct GenericAttentionMask
     }
 
     private:
-    index_t y, x;
+    index_t y, x, sink;
     index_t y_total, x_total;
 };
 
@@ -260,21 +323,23 @@ struct SimplifiedGenericAttentionMask
     static constexpr const char* name = impl::SimplifiedMaskName<IsMasking>::name;
 
     CK_TILE_HOST_DEVICE SimplifiedGenericAttentionMask(index_t y_total_, index_t x_total_)
-        : SimplifiedGenericAttentionMask(0, 0, y_total_, x_total_)
+        : SimplifiedGenericAttentionMask(0, 0, 0, y_total_, x_total_)
     {
     }
 
     CK_TILE_HOST_DEVICE
-    SimplifiedGenericAttentionMask(index_t y_, index_t x_, index_t y_total_, index_t x_total_)
-        : y(y_), x(x_), y_total(y_total_), x_total(x_total_)
+    SimplifiedGenericAttentionMask(
+        index_t y_, index_t x_, index_t sink_, index_t y_total_, index_t x_total_)
+        : y(y_), x(x_), sink(sink_), y_total(y_total_), x_total(x_total_)
     {
     }
     template <typename MaskCoordinates>
     CK_TILE_HOST_DEVICE SimplifiedGenericAttentionMask(const MaskCoordinates& mask_coord)
         : y(mask_coord.at(number<0>{})),
           x(mask_coord.at(number<1>{})),
-          y_total(mask_coord.at(number<2>{})),
-          x_total(mask_coord.at(number<3>{}))
+          sink(mask_coord.at(number<2>{})),
+          y_total(mask_coord.at(number<3>{})),
+          x_total(mask_coord.at(number<4>{}))
     {
     }
 
@@ -308,6 +373,38 @@ struct SimplifiedGenericAttentionMask
         }
     }
 
+    template <index_t YTile, index_t XTile>
+    CK_TILE_HOST_DEVICE constexpr auto
+    GetSinkTileRangeAlongX(index_t i_y, number<YTile>, number<XTile>) const
+    {
+        if constexpr(!IsMasking)
+        {
+            return ck_tile::make_tuple(0, 0, x_total);
+        }
+        else
+        {
+            // get the tile start/end range, assuming we loop over X tile by tile
+            index_t x_start = [&]() {
+                index_t tmp = max(-y + i_y + 1, 0);
+                return (tmp / XTile) * XTile; // round to tile aligned
+            }();
+
+            // TODO: end could be negative, we ignore clamp here, and let caller to check
+            //      ... in which case end-start is negative
+            index_t x_end = [&]() {
+                index_t tmp = min(i_y + YTile - 1 + x, x_total);
+                return ((tmp + XTile - 1) / XTile) * XTile;
+            }();
+
+            index_t sink_seq_end = sink > 0 ? ((sink + XTile - 1) / XTile) * XTile : 0;
+
+            if(x_start <= sink_seq_end && sink > 0)
+                return ck_tile::make_tuple(0, 0, x_end);
+            else
+                return ck_tile::make_tuple(sink_seq_end, x_start, x_end);
+        }
+    }
+
     template <index_t TileHeight, index_t TileWidth>
     CK_TILE_HOST_DEVICE constexpr auto GetTileRangeAlongX(index_t i_y,
                                                           number<TileHeight> height,
@@ -325,6 +422,29 @@ struct SimplifiedGenericAttentionMask
                                    ck_tile::min(origin_end, split_end));
     }
 
+    template <index_t TileHeight, index_t TileWidth>
+    CK_TILE_HOST_DEVICE constexpr auto GetSinkTileRangeAlongX(index_t i_y,
+                                                              number<TileHeight> height,
+                                                              number<TileWidth> width,
+                                                              index_t num_splits,
+                                                              index_t i_split) const
+    {
+        auto [origin_start, origin_end] = GetTileRangeAlongX(i_y, height, width);
+        const index_t x_per_split       = ck_tile::max(1, integer_divide_ceil(x_total, num_splits));
+        const index_t split_start       = x_per_split * i_split;                            // 128
+        const index_t split_end         = ck_tile::min(x_total, split_start + x_per_split); // 256
+        const index_t sink_seq_end      = sink > 0 ? ((sink + width - 1) / width) * width : 0;
+        const index_t start             = ck_tile::max(origin_start, split_start);
+        const index_t end               = ck_tile::min(origin_end, split_end);
+        const bool is_first_intersecting_split =
+            (split_start <= origin_start && split_end >= origin_start);
+        const bool sink_in_range = (sink_seq_end <= start);
+
+        const index_t sink_offset =
+            (is_first_intersecting_split && sink_in_range) ? sink_seq_end : 0;
+        return ck_tile::make_tuple(sink_offset, start, end);
+    }
+
     // to get the loop length along Y axis, return index:[start, end), end-start=length
     // use this if need loop over Y axis tile by tile (like q-seqlen loopover)
     // TODO: y_end still could be negative, so end-start could be negative(need check)
@@ -368,11 +488,22 @@ struct SimplifiedGenericAttentionMask
         {
             index_t x_start = -y + i_y + 1;          // this could be negative, but it's fine
             index_t x_end   = min(i_y + x, x_total); // need min in case x is padded
-
             return i_x < x_start || i_x >= x_end || i_y >= y_total;
         }
     }
 
+    CK_TILE_HOST_DEVICE constexpr auto IsOutOfSinkBound(index_t i_y, index_t i_x) const
+    {
+        if constexpr(!IsMasking)
+            return i_x >= x_total;
+        index_t x_start = -y + i_y + 1;          // this could be negative, but it's fine
+        index_t x_end   = min(i_y + x, x_total); // need min in case x is padded
+        if((i_x < sink) && (y < y_total) && ((i_y + x) > 1) && i_y < x_total)
+            return false;
+        else
+            return i_x < x_start || i_x >= x_end || i_y >= y_total;
+    }
+
     // if current tile is at the edge, means need per-pixel mask check.
     // otherwise no need to check per-pixel
     // Attention! assume the idex passed in this function is with in range of GetTileRangeAlongX/Y()
@@ -406,7 +537,7 @@ struct SimplifiedGenericAttentionMask
     }
 
     private:
-    index_t y, x;
+    index_t y, x, sink;
     index_t y_total, x_total;
 };
 
@@ -620,6 +751,7 @@ static constexpr bool is_generic_attention_mask_v = is_generic_attention_mask<Ma
 CK_TILE_HOST_DEVICE constexpr auto
 make_generic_attention_mask_coordinates_from_lr_window(index_t left_size,
                                                        index_t right_size,
+                                                       index_t sink_size,
                                                        index_t y_total,
                                                        index_t x_total,
                                                        bool is_top_left = true)
@@ -637,7 +769,21 @@ make_generic_attention_mask_coordinates_from_lr_window(index_t left_size,
     index_t x = 1 + right_size + x_tmp;
     index_t y = 1 + left_size + y_tmp;
 
-    return ck_tile::make_tuple(y, x, y_total, x_total);
+    return ck_tile::make_tuple(y, x, sink_size, y_total, x_total);
+}
+
+template <typename MaskType>
+CK_TILE_HOST_DEVICE constexpr auto
+make_generic_attention_mask_from_lr_window(index_t left_size,
+                                           index_t right_size,
+                                           index_t sink_size,
+                                           index_t y_total,
+                                           index_t x_total,
+                                           bool is_top_left = true)
+{
+    auto r = make_generic_attention_mask_coordinates_from_lr_window(
+        left_size, right_size, sink_size, y_total, x_total, is_top_left);
+    return MaskType{r.at(number<0>{}), r.at(number<1>{}), sink_size, y_total, x_total};
 }
 
 template <typename MaskType>
@@ -649,7 +795,7 @@ make_generic_attention_mask_from_lr_window(index_t left_size,
                                            bool is_top_left = true)
 {
     auto r = make_generic_attention_mask_coordinates_from_lr_window(
-        left_size, right_size, y_total, x_total, is_top_left);
-    return MaskType{r.at(number<0>{}), r.at(number<1>{}), y_total, x_total};
+        left_size, right_size, 0, y_total, x_total, is_top_left);
+    return MaskType{r.at(number<0>{}), r.at(number<1>{}), 0, y_total, x_total};
 }
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/block/variants.hpp b/include/ck_tile/ops/fmha/block/variants.hpp
index 29d9cf2a8e..b6f79873b4 100644
--- a/include/ck_tile/ops/fmha/block/variants.hpp
+++ b/include/ck_tile/ops/fmha/block/variants.hpp
@@ -162,6 +162,17 @@ struct StandardAttention
     {
         return !params.impl_mask.IsOutOfBound(qo_idx, kv_idx);
     }
+
+    template <typename Params>
+    __device__ __forceinline__ bool LogitsSinkMask(const Params& params,
+                                                   [[maybe_unused]] uint32_t batch_idx,
+                                                   uint32_t qo_idx,
+                                                   uint32_t kv_idx,
+                                                   [[maybe_unused]] uint32_t qo_head_idx,
+                                                   [[maybe_unused]] uint32_t kv_head_idx) const
+    {
+        return !params.impl_mask.IsOutOfSinkBound(qo_idx, kv_idx);
+    }
 };
 
 template <bool UseExp2 = false>
@@ -224,6 +235,17 @@ struct LogitsSoftCap
     {
         return !params.impl_mask.IsOutOfBound(qo_idx, kv_idx);
     }
+
+    template <typename Params>
+    __device__ __forceinline__ bool LogitsSinkMask(const Params& params,
+                                                   [[maybe_unused]] uint32_t batch_idx,
+                                                   uint32_t qo_idx,
+                                                   uint32_t kv_idx,
+                                                   [[maybe_unused]] uint32_t qo_head_idx,
+                                                   [[maybe_unused]] uint32_t kv_head_idx) const
+    {
+        return !params.impl_mask.IsOutOfSinkBound(qo_idx, kv_idx);
+    }
 };
 
 constexpr uint32_t CUSTOM_MASK     = 1U;
@@ -297,6 +319,17 @@ struct ComposedAttention
     {
         return !params.impl_mask.IsOutOfBound(qo_idx, kv_idx);
     }
+
+    template <typename Params>
+    __device__ __forceinline__ bool LogitsSinkMask(const Params& params,
+                                                   [[maybe_unused]] uint32_t batch_idx,
+                                                   uint32_t qo_idx,
+                                                   uint32_t kv_idx,
+                                                   [[maybe_unused]] uint32_t qo_head_idx,
+                                                   [[maybe_unused]] uint32_t kv_head_idx) const
+    {
+        return !params.impl_mask.IsOutOfSinkBound(qo_idx, kv_idx);
+    }
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
index e63ad8252b..10b5d0120e 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
@@ -200,7 +200,7 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
     struct FmhaFwdMaskKargs
     {
         // ck_tile::index_t window_size_left, window_size_right;
-        ck_tile::index_t window_size_left, window_size_right;
+        ck_tile::index_t window_size_left, window_size_right, sink_size;
         ck_tile::GenericAttentionMaskEnum mask_type;
     };
 
@@ -356,6 +356,7 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
               ck_tile::index_t batch_stride_o,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
+              ck_tile::index_t sink_size,
               ck_tile::index_t mask_type,
               float p_drop,
               bool s_randval,
@@ -418,6 +419,7 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
         {
             kargs.window_size_left  = window_size_left;
             kargs.window_size_right = window_size_right;
+            kargs.sink_size         = sink_size;
             kargs.mask_type         = static_cast<ck_tile::GenericAttentionMaskEnum>(mask_type);
         }
         if constexpr(kStoreLSE)
@@ -497,6 +499,7 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
               ck_tile::index_t batch_stride_v,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
+              ck_tile::index_t sink_size,
               ck_tile::index_t mask_type,
               float p_drop,
               bool s_randval,
@@ -557,6 +560,7 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
         {
             kargs.window_size_left  = window_size_left;
             kargs.window_size_right = window_size_right;
+            kargs.sink_size         = sink_size;
             kargs.mask_type         = static_cast<ck_tile::GenericAttentionMaskEnum>(mask_type);
         }
         if constexpr(kStoreLSE)
@@ -1008,6 +1012,7 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
                 return ck_tile::make_generic_attention_mask_from_lr_window<FmhaMask>(
                     kargs.window_size_left,
                     kargs.window_size_right,
+                    kargs.sink_size,
                     kargs.seqlen_q,
                     kargs.seqlen_k,
                     kargs.mask_type == GenericAttentionMaskEnum::MASK_FROM_TOP_LEFT);
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index 9890d1f2e4..9160e79af6 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -58,6 +58,7 @@ struct FmhaFwdKernel
     static constexpr bool kHasDropout       = FmhaPipeline::kHasDropout;
     static constexpr auto QScaleEnum        = FmhaPipeline::Problem::QScaleEnum;
     static constexpr bool kSkipMinSeqlenQ   = FmhaPipeline::Problem::kSkipMinSeqlenQ;
+    static constexpr bool kHasSink          = FmhaPipeline::kHasSink;
 
     using AttentionVariant = ck_tile::remove_cvref_t<typename FmhaPipeline::AttentionVariant>;
     using FmhaMask         = ck_tile::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
@@ -155,7 +156,7 @@ struct FmhaFwdKernel
     struct FmhaFwdMaskKargs
     {
         // ck_tile::index_t window_size_left, window_size_right;
-        ck_tile::index_t window_size_left, window_size_right;
+        ck_tile::index_t window_size_left, window_size_right, sink_size;
         ck_tile::GenericAttentionMaskEnum mask_type;
     };
 
@@ -335,6 +336,7 @@ struct FmhaFwdKernel
                   ck_tile::index_t batch_stride_o,
                   ck_tile::index_t window_size_left,
                   ck_tile::index_t window_size_right,
+                  ck_tile::index_t sink_size,
                   ck_tile::index_t mask_type,
                   float p_drop,
                   bool s_randval,
@@ -393,6 +395,7 @@ struct FmhaFwdKernel
         {
             kargs.window_size_left  = window_size_left;
             kargs.window_size_right = window_size_right;
+            kargs.sink_size         = sink_size;
             kargs.mask_type         = static_cast<ck_tile::GenericAttentionMaskEnum>(mask_type);
         }
         if constexpr(kStoreLSE)
@@ -481,6 +484,7 @@ struct FmhaFwdKernel
               ck_tile::index_t batch_stride_o,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
+              ck_tile::index_t sink_size,
               ck_tile::index_t mask_type,
               float p_drop,
               bool s_randval,
@@ -529,6 +533,7 @@ struct FmhaFwdKernel
             batch_stride_o,
             window_size_left,
             window_size_right,
+            sink_size,
             mask_type,
             p_drop,
             s_randval,
@@ -580,6 +585,7 @@ struct FmhaFwdKernel
               ck_tile::index_t batch_stride_o,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
+              ck_tile::index_t sink_size,
               ck_tile::index_t mask_type,
               float p_drop,
               bool s_randval,
@@ -628,6 +634,7 @@ struct FmhaFwdKernel
             batch_stride_o,
             window_size_left,
             window_size_right,
+            sink_size,
             mask_type,
             p_drop,
             s_randval,
@@ -673,6 +680,7 @@ struct FmhaFwdKernel
                   ck_tile::index_t nhead_stride_o,
                   ck_tile::index_t window_size_left,
                   ck_tile::index_t window_size_right,
+                  ck_tile::index_t sink_size,
                   ck_tile::index_t mask_type,
                   ck_tile::index_t min_seqlen_q,
                   float p_drop,
@@ -732,6 +740,7 @@ struct FmhaFwdKernel
         {
             kargs.window_size_left  = window_size_left;
             kargs.window_size_right = window_size_right;
+            kargs.sink_size         = sink_size;
             kargs.mask_type         = static_cast<ck_tile::GenericAttentionMaskEnum>(mask_type);
         }
         if constexpr(kStoreLSE)
@@ -817,6 +826,7 @@ struct FmhaFwdKernel
               ck_tile::index_t nhead_stride_o,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
+              ck_tile::index_t sink_size,
               ck_tile::index_t mask_type,
               ck_tile::index_t min_seqlen_q,
               float p_drop,
@@ -861,6 +871,7 @@ struct FmhaFwdKernel
             nhead_stride_o,
             window_size_left,
             window_size_right,
+            sink_size,
             mask_type,
             min_seqlen_q,
             p_drop,
@@ -908,6 +919,7 @@ struct FmhaFwdKernel
               ck_tile::index_t nhead_stride_o,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
+              ck_tile::index_t sink_size,
               ck_tile::index_t mask_type,
               ck_tile::index_t min_seqlen_q,
               float p_drop,
@@ -952,6 +964,7 @@ struct FmhaFwdKernel
             nhead_stride_o,
             window_size_left,
             window_size_right,
+            sink_size,
             mask_type,
             min_seqlen_q,
             p_drop,
@@ -1443,6 +1456,7 @@ struct FmhaFwdKernel
                     return ck_tile::make_generic_attention_mask_from_lr_window<FmhaMask>(
                         kargs.window_size_left,
                         kargs.window_size_right,
+                        kargs.sink_size,
                         kargs.seqlen_q,
                         kargs.seqlen_k,
                         kargs.mask_type == GenericAttentionMaskEnum::MASK_FROM_TOP_LEFT);
@@ -2182,6 +2196,7 @@ struct FmhaFwdKernel
                     return ck_tile::make_generic_attention_mask_from_lr_window<FmhaMask>(
                         kargs.window_size_left,
                         kargs.window_size_right,
+                        kargs.sink_size,
                         kargs.seqlen_q,
                         kargs.seqlen_k,
                         kargs.mask_type == GenericAttentionMaskEnum::MASK_FROM_TOP_LEFT);
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp
index 677ead91ad..b75b35fc1e 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp
@@ -55,6 +55,7 @@ struct FmhaFwdPagedKVKernel
     static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant;
     static constexpr bool kSkipMinSeqlenQ   = FmhaPipeline::Problem::kSkipMinSeqlenQ;
     static constexpr bool kIsPagedKV        = FmhaPipeline::Problem::kIsPagedKV;
+    static constexpr bool kHasSink          = FmhaPipeline::kHasSink;
 
     using AttentionVariant = ck_tile::remove_cvref_t<typename FmhaPipeline::AttentionVariant>;
     using FmhaMask         = ck_tile::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
@@ -101,7 +102,7 @@ struct FmhaFwdPagedKVKernel
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
             "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "_npad" : "_" + pn) +
             (kHasLogitsSoftCap ? "_logits" : "_nlogits" ) + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
-            (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" )  + (kSkipMinSeqlenQ ? "_skip" : "_nskip" )  + (kDoFp8StaticQuant ? "_squant" : "_nsquant" ) + (kIsPagedKV ? "_pagedkv" : "_npagedkv" );
+            (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" )  + (kSkipMinSeqlenQ ? "_skip" : "_nskip" )  + (kDoFp8StaticQuant ? "_squant" : "_nsquant" ) + (kIsPagedKV ? "_pagedkv" : "_npagedkv" ) + (kHasSink ? "_sink" : "_nsink" );
         #undef _SS_
         #undef _TS_
         // clang-format on
@@ -189,7 +190,7 @@ struct FmhaFwdPagedKVKernel
     struct FmhaFwdMaskKargs
     {
         // ck_tile::index_t window_size_left, window_size_right;
-        ck_tile::index_t window_size_left, window_size_right;
+        ck_tile::index_t window_size_left, window_size_right, sink_size;
         ck_tile::GenericAttentionMaskEnum mask_type;
     };
 
@@ -326,6 +327,7 @@ struct FmhaFwdPagedKVKernel
                   ck_tile::index_t batch_stride_o,
                   ck_tile::index_t window_size_left,
                   ck_tile::index_t window_size_right,
+                  ck_tile::index_t sink_size,
                   ck_tile::index_t mask_type)
     {
         Kargs kargs{{q_ptr,
@@ -379,6 +381,7 @@ struct FmhaFwdPagedKVKernel
         {
             kargs.window_size_left  = window_size_left;
             kargs.window_size_right = window_size_right;
+            kargs.sink_size         = sink_size;
             kargs.mask_type         = static_cast<ck_tile::GenericAttentionMaskEnum>(mask_type);
         }
         if constexpr(kStoreLSE)
@@ -453,6 +456,7 @@ struct FmhaFwdPagedKVKernel
               ck_tile::index_t batch_stride_o,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
+              ck_tile::index_t sink_size,
               ck_tile::index_t mask_type)
     {
         return MakeKargsImpl(q_ptr,
@@ -495,6 +499,7 @@ struct FmhaFwdPagedKVKernel
                              batch_stride_o,
                              window_size_left,
                              window_size_right,
+                             sink_size,
                              mask_type);
     }
 
@@ -536,6 +541,7 @@ struct FmhaFwdPagedKVKernel
                   ck_tile::index_t batch_stride_v, // only used for paged-kvcache
                   ck_tile::index_t window_size_left,
                   ck_tile::index_t window_size_right,
+                  ck_tile::index_t sink_size,
                   ck_tile::index_t mask_type,
                   ck_tile::index_t min_seqlen_q)
     {
@@ -590,6 +596,7 @@ struct FmhaFwdPagedKVKernel
         {
             kargs.window_size_left  = window_size_left;
             kargs.window_size_right = window_size_right;
+            kargs.sink_size         = sink_size;
             kargs.mask_type         = static_cast<ck_tile::GenericAttentionMaskEnum>(mask_type);
         }
         if constexpr(kStoreLSE)
@@ -660,6 +667,7 @@ struct FmhaFwdPagedKVKernel
               ck_tile::index_t batch_stride_v, // only used for paged-kvcache
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
+              ck_tile::index_t sink_size,
               ck_tile::index_t mask_type,
               ck_tile::index_t min_seqlen_q)
     {
@@ -699,6 +707,7 @@ struct FmhaFwdPagedKVKernel
                              batch_stride_v,
                              window_size_left,
                              window_size_right,
+                             sink_size,
                              mask_type,
                              min_seqlen_q);
     }
@@ -1257,6 +1266,7 @@ struct FmhaFwdPagedKVKernel
                 return ck_tile::make_generic_attention_mask_from_lr_window<FmhaMask>(
                     kargs.window_size_left,
                     kargs.window_size_right,
+                    kargs.sink_size,
                     kargs.seqlen_q,
                     kargs.seqlen_k,
                     kargs.mask_type == GenericAttentionMaskEnum::MASK_FROM_TOP_LEFT);
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index 19592e8bf4..bd5cddb526 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -51,6 +51,7 @@ struct FmhaFwdSplitKVKernel
     static constexpr bool kStoreLSE         = FmhaPipeline::kStoreLSE;
     static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant;
     static constexpr bool kIsPagedKV        = FmhaPipeline::Problem::kIsPagedKV;
+    static constexpr bool kHasSink          = FmhaPipeline::Problem::kHasSink;
     static constexpr bool kMergeNumHeadGroupsSeqLenQ =
         FmhaPipeline::Problem::kMergeNumHeadGroupsSeqLenQ;
     using AttentionVariant = ck_tile::remove_cvref_t<typename FmhaPipeline::AttentionVariant>;
@@ -101,7 +102,7 @@ struct FmhaFwdSplitKVKernel
             "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "_npad" : "_" + pn) +
             (kHasLogitsSoftCap ? "_logits" : "_nlogits" ) + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
             (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) +
-            (kDoFp8StaticQuant ? "_squant" : "_nsquant") + (kIsPagedKV ? "_pagedkv" : "_npagedkv" );
+            (kDoFp8StaticQuant ? "_squant" : "_nsquant") + (kIsPagedKV ? "_pagedkv" : "_npagedkv" ) + (kHasSink ? "_sink" : "_nsink" );
         #undef _SS_
         #undef _TS_
         // clang-format on
@@ -198,7 +199,7 @@ struct FmhaFwdSplitKVKernel
     struct MaskKargs
     {
         // ck_tile::index_t window_size_left, window_size_right;
-        ck_tile::index_t window_size_left, window_size_right;
+        ck_tile::index_t window_size_left, window_size_right, sink_size;
         ck_tile::GenericAttentionMaskEnum mask_type;
     };
 
@@ -325,6 +326,7 @@ struct FmhaFwdSplitKVKernel
               ck_tile::index_t split_stride_o_acc,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
+              ck_tile::index_t sink_size,
               ck_tile::index_t mask_type)
     {
         Kargs kargs{{q_ptr,
@@ -384,6 +386,7 @@ struct FmhaFwdSplitKVKernel
         {
             kargs.window_size_left  = window_size_left;
             kargs.window_size_right = window_size_right;
+            kargs.sink_size         = sink_size;
             kargs.mask_type         = static_cast<ck_tile::GenericAttentionMaskEnum>(mask_type);
         }
         if constexpr(kDoFp8StaticQuant)
@@ -451,6 +454,7 @@ struct FmhaFwdSplitKVKernel
               ck_tile::index_t split_stride_o_acc,
               ck_tile::index_t window_size_left,
               ck_tile::index_t window_size_right,
+              ck_tile::index_t sink_size,
               ck_tile::index_t mask_type)
     {
         Kargs kargs{{q_ptr,
@@ -508,6 +512,7 @@ struct FmhaFwdSplitKVKernel
         {
             kargs.window_size_left  = window_size_left;
             kargs.window_size_right = window_size_right;
+            kargs.sink_size         = sink_size;
             kargs.mask_type         = static_cast<ck_tile::GenericAttentionMaskEnum>(mask_type);
         }
         if constexpr(kDoFp8StaticQuant)
@@ -994,6 +999,7 @@ struct FmhaFwdSplitKVKernel
                 return ck_tile::make_generic_attention_mask_from_lr_window<FmhaMask>(
                     kargs.window_size_left,
                     kargs.window_size_right,
+                    kargs.sink_size,
                     kargs.seqlen_q,
                     kargs.seqlen_k,
                     kargs.mask_type == GenericAttentionMaskEnum::MASK_FROM_TOP_LEFT);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp
index 693f81d08a..d55d0d9342 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp
@@ -57,6 +57,7 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
     static constexpr auto BiasEnum          = Problem::BiasEnum;
     static constexpr bool kStoreLSE         = Problem::kStoreLSE;
     static constexpr bool kIsPagedKV        = Problem::kIsPagedKV;
+    static constexpr bool kHasSink          = Problem::kHasSink;
 
     static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 &&
                    (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS ||
@@ -228,10 +229,22 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
         clear_tile(o_acc);
         set_tile(m, -numeric<SMPLComputeDataType>::infinity());
         clear_tile(l);
-
-        const auto q_origin = q_dram_window.get_window_origin();
-        const auto [logical_seqlen_k_start, logical_seqlen_k_end] =
-            mask.GetTileRangeAlongX(q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
+        const auto q_origin          = q_dram_window.get_window_origin();
+        const auto tile_range_result = [&mask, &q_origin]() {
+            if constexpr(kHasSink)
+                return mask.GetSinkTileRangeAlongX(
+                    q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
+            else
+            {
+                auto [start, end] =
+                    mask.GetTileRangeAlongX(q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
+                return ck_tile::make_tuple(0, start, end);
+            }
+        }();
+        const auto sink_seq_end           = tile_range_result.get(ck_tile::number<0>{});
+        const auto logical_seqlen_k_start = tile_range_result.get(ck_tile::number<1>{});
+        const auto logical_seqlen_k_end   = tile_range_result.get(ck_tile::number<2>{});
+        const auto num_sink_loop          = integer_divide_ceil(sink_seq_end, kN0);
 
         // check early exit if no work to do
         if constexpr(FmhaMask::IsMasking || kPadSeqLenK)
@@ -255,7 +268,6 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
                 return o_acc;
             }
         }
-
         // k_dram_block_window
         const index_t physical_seqlen_k_start = logical_seqlen_k_start + kv_l2p_offset;
         const index_t physical_seqlen_k_end   = logical_seqlen_k_end + kv_l2p_offset;
@@ -274,27 +286,36 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
                     return physical_seqlen_k_start_;
                 }
             }();
+        const auto kv_load_start = (sink_seq_end == 0 && aligned_physical_seqlen_k_start > 0)
+                                       ? aligned_physical_seqlen_k_start
+                                       : 0;
         const index_t num_total_loop =
-            integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0);
+            integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0) +
+            num_sink_loop;
 
         auto [i_page_block_k, k_dram_block_window] = k_page_block_navigator.make_tile_window(
-            k_dram_block_window_lengths, {aligned_physical_seqlen_k_start, 0});
+            k_dram_block_window_lengths, {kv_load_start, 0});
+
+        const auto bias_origin      = bias_dram_block_window_tmp.get_window_origin();
+        const index_t bias_n_offset = [&]() {
+            if constexpr(kHasSink)
+                return kv_load_start;
+            else
+                return logical_seqlen_k_start -
+                       (physical_seqlen_k_start - aligned_physical_seqlen_k_start);
+        }();
 
-        const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
         auto bias_dram_window =
             make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
                              bias_dram_block_window_tmp.get_window_lengths(),
-                             {bias_origin.at(number<0>{}),
-                              logical_seqlen_k_start - (physical_seqlen_k_start -
-                                                        aligned_physical_seqlen_k_start)}, // M/N
+                             {bias_origin.at(number<0>{}), bias_n_offset},
                              Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
 
         // v_dram_window
         auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window(
             v_dram_block_window_lengths,
-            {0, aligned_physical_seqlen_k_start}, // TODO: hdim split?
+            {0, kv_load_start}, // TODO: hdim split?
             Policy::template MakeVDramTileDistribution<Problem>());
-
         auto q_tile = tile_elementwise_in(q_element_func, q);
 
         // prefetch K tile
@@ -321,9 +342,16 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
                 store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile));
                 k_block_tile = load_tile(k_dram_window);
             }
+            const bool is_sink_tile  = ((num_sink_loop - 1) == i_total_loops);
+            const auto k_move_offset = [&]() {
+                if constexpr(kHasSink)
+                    return is_sink_tile ? logical_seqlen_k_start - sink_seq_end + kN0 : kN0;
+                else
+                    return kN0;
+            }();
             auto physical_next_block_id_k =
                 amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id(
-                    i_page_block_k, k_dram_block_window, {kN0, 0}));
+                    i_page_block_k, k_dram_block_window, {k_move_offset, 0}));
             auto physical_next_block_id_v = amd_wave_read_first_lane(
                 v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1}));
 
@@ -442,7 +470,7 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
 #endif
                 }
             }
-            move_tile_window(bias_dram_window, {0, kN0});
+            move_tile_window(bias_dram_window, {0, k_move_offset});
 
             {
                 const auto k_origin = k_page_block_navigator.to_global_window_origin(
@@ -474,14 +502,29 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
                                         number<kN0>{});
                     if(need_perpixel_check)
                     {
-                        set_tile_if(
-                            s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
-                                const auto row =
-                                    q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
-                                const auto col =
-                                    k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
-                                return mask.IsOutOfBound(row, col - kv_l2p_offset);
+                        auto apply_mask = [&](auto&& mask_func) {
+                            set_tile_if(s_acc,
+                                        -numeric<SMPLComputeDataType>::infinity(),
+                                        [&](auto tile_idx) {
+                                            const auto row =
+                                                q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
+                                            const auto col =
+                                                k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                                            return mask_func(row, col - kv_l2p_offset);
+                                        });
+                        };
+
+                        if constexpr(kHasSink)
+                        {
+                            apply_mask([&](auto row, auto col) {
+                                return mask.IsOutOfSinkBound(row, col);
                             });
+                        }
+                        else
+                        {
+                            apply_mask(
+                                [&](auto row, auto col) { return mask.IsOutOfBound(row, col); });
+                        }
                     }
                 }
             }
@@ -647,7 +690,12 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
             }
             // move K tile windows
             i_page_block_k = k_page_block_navigator.move_tile_window(
-                i_page_block_k, k_dram_block_window, {kN0, 0}, physical_next_block_id_k);
+                i_page_block_k, k_dram_block_window, {k_move_offset, 0}, physical_next_block_id_k);
+            physical_next_block_id_v =
+                amd_wave_read_first_lane(v_page_block_navigator.prefetch_table_id(
+                    i_page_block_v, v_dram_window, {0, k_move_offset - kN0}));
+            i_page_block_v = v_page_block_navigator.move_tile_window(
+                i_page_block_v, v_dram_window, {0, k_move_offset - kN0}, physical_next_block_id_v);
             // tail
             {
                 block_sync_lds();
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
index 0b30077a29..944d49a8aa 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
@@ -57,6 +57,7 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
     static constexpr bool kStoreLSE         = Problem::kStoreLSE;
     static constexpr bool kIsPagedKV        = Problem::kIsPagedKV;
     static constexpr bool kHasUnevenSplits  = Problem::kHasUnevenSplits;
+    static constexpr bool kHasSink          = Problem::kHasSink;
 
     static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 &&
                    (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS ||
@@ -256,11 +257,23 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
         set_tile(m, -numeric<SMPLComputeDataType>::infinity());
         clear_tile(l);
 
-        const auto q_origin = q_dram_window.get_window_origin();
-        const auto [logical_seqlen_k_start, logical_seqlen_k_end] = mask.GetTileRangeAlongX(
-            q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{}, num_splits, i_split);
+        const auto q_origin          = q_dram_window.get_window_origin();
+        const auto tile_range_result = [&mask, &q_origin, num_splits, i_split]() {
+            if constexpr(kHasSink)
+                return mask.GetSinkTileRangeAlongX(
+                    q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{}, num_splits, i_split);
+            else
+            {
+                auto [start, end] = mask.GetTileRangeAlongX(
+                    q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{}, num_splits, i_split);
+                return ck_tile::make_tuple(0, start, end);
+            }
+        }();
+        const auto sink_seq_end           = tile_range_result.get(ck_tile::number<0>{});
+        const auto logical_seqlen_k_start = tile_range_result.get(ck_tile::number<1>{});
+        const auto logical_seqlen_k_end   = tile_range_result.get(ck_tile::number<2>{});
-
+        const auto num_sink_loop          = integer_divide_ceil(sink_seq_end, kN0);
         // check early exit if no work to do
         if constexpr(FmhaMask::IsMasking || kPadSeqLenK || kHasUnevenSplits)
         {
             const index_t logical_num_total_loop =
@@ -304,24 +317,33 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
                     return physical_seqlen_k_start_;
                 }
             }();
+        const auto kv_load_start = (sink_seq_end == 0 && aligned_physical_seqlen_k_start > 0)
+                                       ? aligned_physical_seqlen_k_start
+                                       : 0;
         const index_t num_total_loop =
-            integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0);
+            integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0) +
+            num_sink_loop;
 
         auto [i_page_block_k, k_dram_block_window] = k_page_block_navigator.make_tile_window(
-            k_dram_block_window_lengths, {aligned_physical_seqlen_k_start, 0});
+            k_dram_block_window_lengths, {kv_load_start, 0});
 
-        const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
+        const auto bias_origin      = bias_dram_block_window_tmp.get_window_origin();
+        const index_t bias_n_offset = [&]() {
+            if constexpr(kHasSink)
+                return kv_load_start;
+            else
+                return logical_seqlen_k_start -
+                       (physical_seqlen_k_start - aligned_physical_seqlen_k_start);
+        }();
         auto bias_dram_window =
             make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
                              bias_dram_block_window_tmp.get_window_lengths(),
-                             {bias_origin.at(number<0>{}),
-                              logical_seqlen_k_start - (physical_seqlen_k_start -
-                                                        aligned_physical_seqlen_k_start)}, // M/N
+                             {bias_origin.at(number<0>{}), bias_n_offset},
                              Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
 
         auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window(
             v_dram_block_window_lengths,
-            {0, aligned_physical_seqlen_k_start}, // TODO: hdim split?
+            {0, kv_load_start}, // TODO: hdim split?
             Policy::template MakeVDramTileDistribution<Problem>());
 
         // store Q into LDS
@@ -369,7 +391,13 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
         {
             // STAGE 1, QK gemm
             clear_tile(s_acc); // initialize C
-
+            const bool is_sink_tile  = ((num_sink_loop - 1) == i_total_loops);
+            const auto k_move_offset = [&]() {
+                if constexpr(kHasSink)
+                    return is_sink_tile ? logical_seqlen_k_start - sink_seq_end + kN0 : kN0;
+                else
+                    return kN0;
+            }();
             // load the second tile of the first iteration
             k_block_tile = load_tile(k_dram_window);
 
@@ -494,7 +522,7 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
 #endif
                 }
             }
-            move_tile_window(bias_dram_window, {0, kN0});
+            move_tile_window(bias_dram_window, {0, k_move_offset});
 
             /// TODO: only check in first/last iteration without increasing code size
             if constexpr(kHasUnevenSplits)
@@ -505,7 +533,7 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
                     s_acc,
                     -numeric<SMPLComputeDataType>::infinity(),
                     [&,
-                     physical_seqlen_k_start_ = physical_seqlen_k_start,
+                     physical_seqlen_k_start_ = is_sink_tile ? 0 : physical_seqlen_k_start,
                      physical_seqlen_k_end_   = physical_seqlen_k_end](auto tile_idx) {
                         const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
                         if constexpr(kIsPagedKV)
@@ -530,12 +558,26 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
                                                            number<kN0>{});
                 if(need_perpixel_check)
                 {
-                    set_tile_if(
-                        s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
-                            const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
-                            const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
-                            return mask.IsOutOfBound(row, col - kv_l2p_offset);
-                        });
+                    auto apply_mask = [&](auto&& mask_func) {
+                        set_tile_if(
+                            s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
+                                const auto row =
+                                    q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
+                                const auto col =
+                                    k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                                return mask_func(row, col - kv_l2p_offset);
+                            });
+                    };
+
+                    if constexpr(kHasSink)
+                    {
+                        apply_mask(
+                            [&](auto row, auto col) { return mask.IsOutOfSinkBound(row, col); });
+                    }
+                    else
+                    {
+                        apply_mask([&](auto row, auto col) { return mask.IsOutOfBound(row, col); });
+                    }
                 }
             }
 
@@ -546,7 +588,7 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
             {
                 // move K tile windows
                 i_page_block_k = k_page_block_navigator.move_tile_window(
-                    i_page_block_k, k_dram_block_window, {kN0, 0});
+                    i_page_block_k, k_dram_block_window, {k_move_offset, 0});
 
                 k_dram_window = make_tile_window(
                     k_dram_block_window,
@@ -742,6 +784,8 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
                 // moving k_dram_window is an in-page-block operation, so there is
                 // no need to invoke k_page_block_navigator.move_tile_window() here.
                 move_tile_window(k_dram_window, {0, kK0});
+                i_page_block_v = v_page_block_navigator.move_tile_window(
+                    i_page_block_v, v_dram_window, {0, k_move_offset - kN0});
                 store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile));
             }
         } while(++i_total_loops < num_total_loop);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
index 6be6a64b1c..26a4cc905c 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
@@ -56,6 +56,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
     static constexpr bool kStoreLSE         = Problem::kStoreLSE;
     static constexpr bool kIsPagedKV        = Problem::kIsPagedKV;
     static constexpr bool kHasUnevenSplits  = Problem::kHasUnevenSplits;
+    static constexpr bool kHasSink          = Problem::kHasSink;
 
     static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 &&
                    (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS ||
@@ -229,9 +230,23 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
         set_tile(m, -numeric<SMPLComputeDataType>::infinity());
         clear_tile(l);
 
-        const auto q_origin = q_dram_window.get_window_origin();
-        const auto [logical_seqlen_k_start, logical_seqlen_k_end] = mask.GetTileRangeAlongX(
-            q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{}, num_splits, i_split);
+        const auto q_origin          = q_dram_window.get_window_origin();
+        const auto tile_range_result = [&mask, &q_origin, num_splits, i_split]() {
+            if constexpr(kHasSink)
+                return mask.GetSinkTileRangeAlongX(
+                    q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{}, num_splits, i_split);
+            else
+            {
+                auto [start, end] = mask.GetTileRangeAlongX(
+                    q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{}, num_splits, i_split);
+                return ck_tile::make_tuple(0, start, end);
+            }
+        }();
+        const auto sink_seq_end           = tile_range_result.get(ck_tile::number<0>{});
+        const auto logical_seqlen_k_start = tile_range_result.get(ck_tile::number<1>{});
+        const auto logical_seqlen_k_end   = tile_range_result.get(ck_tile::number<2>{});
+
+        const auto num_sink_loop = integer_divide_ceil(sink_seq_end, kN0);
 
         // check early exit if no work to do
         if constexpr(FmhaMask::IsMasking || kPadSeqLenK || kHasUnevenSplits)
@@ -274,24 +289,35 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                     return physical_seqlen_k_start_;
                 }
             }();
+        const auto kv_load_start = (sink_seq_end == 0 && aligned_physical_seqlen_k_start > 0)
+                                       ? aligned_physical_seqlen_k_start
+                                       : 0;
         const index_t num_total_loop =
-            integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0);
+            integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0) +
+            num_sink_loop;
 
         auto [i_page_block_k, k_dram_block_window] = k_page_block_navigator.make_tile_window(
-            k_dram_block_window_lengths, {aligned_physical_seqlen_k_start, 0});
+            k_dram_block_window_lengths, {kv_load_start, 0});
 
         const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
+
+        const index_t bias_n_offset = [&]() {
+            if constexpr(kHasSink)
+                return kv_load_start;
+            else
+                return logical_seqlen_k_start -
+                       (physical_seqlen_k_start - aligned_physical_seqlen_k_start);
+        }();
+
         auto bias_dram_window =
             make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
                              bias_dram_block_window_tmp.get_window_lengths(),
-                             {bias_origin.at(number<0>{}),
-                              logical_seqlen_k_start - (physical_seqlen_k_start -
-                                                        aligned_physical_seqlen_k_start)}, // M/N
+                             {bias_origin.at(number<0>{}), bias_n_offset},
                              Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
 
         auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window(
             v_dram_block_window_lengths,
-            {0, aligned_physical_seqlen_k_start}, // TODO: hdim split?
+            {0, kv_load_start}, // TODO: hdim split?
             Policy::template MakeVDramTileDistribution<Problem>());
 
         auto q_tile = tile_elementwise_in(q_element_func, q);
@@ -320,9 +346,18 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                 store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile));
                 k_block_tile = load_tile(k_dram_window);
             }
+            const bool is_sink_tile = ((num_sink_loop - 1) == i_total_loops);
+
+            const auto k_move_offset = [&]() {
+                if constexpr(kHasSink)
+                    return is_sink_tile ? logical_seqlen_k_start - sink_seq_end + kN0 : kN0;
+                else
+                    return kN0;
+            }();
+
             auto physical_next_block_id_k =
                 amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id(
-                    i_page_block_k, k_dram_block_window, {kN0, 0}));
+                    i_page_block_k, k_dram_block_window, {k_move_offset, 0}));
             auto physical_next_block_id_v = amd_wave_read_first_lane(
                 v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1}));
 
@@ -441,7 +476,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
 #endif
                 }
             }
-            move_tile_window(bias_dram_window, {0, kN0});
+            move_tile_window(bias_dram_window, {0, k_move_offset});
 
             /// TODO: only check in first/last iteration without increasing code size
             if constexpr(kHasUnevenSplits)
@@ -452,7 +487,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                     s_acc,
                     -numeric<SMPLComputeDataType>::infinity(),
                     [&,
-                     physical_seqlen_k_start_ = physical_seqlen_k_start,
+                     physical_seqlen_k_start_ = is_sink_tile ? 0 : physical_seqlen_k_start,
                      physical_seqlen_k_end_   = physical_seqlen_k_end](auto tile_idx) {
                         const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
                         if constexpr(kIsPagedKV)
@@ -477,12 +512,26 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                                                            number<kN0>{});
                 if(need_perpixel_check)
                 {
-                    set_tile_if(
-                        s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
-                            const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
-                            const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
-                            return mask.IsOutOfBound(row, col - kv_l2p_offset);
-                        });
+                    auto apply_mask = [&](auto&& mask_func) {
+                        set_tile_if(
+                            s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
+                                const auto row =
+                                    q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
+                                const auto col =
+                                    k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                                return mask_func(row, col - kv_l2p_offset);
+                            });
+                    };
+
+                    if constexpr(kHasSink)
+                    {
+                        apply_mask(
+                            [&](auto row, auto col) { return mask.IsOutOfSinkBound(row, col); });
+                    }
+                    else
+                    {
+                        apply_mask([&](auto row, auto col) { return mask.IsOutOfBound(row, col); });
+                    }
                 }
             }
 
@@ -647,7 +696,12 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
             }
             // move K tile windows
             i_page_block_k = k_page_block_navigator.move_tile_window(
-                i_page_block_k, k_dram_block_window, {kN0, 0}, physical_next_block_id_k);
+                i_page_block_k, k_dram_block_window, {k_move_offset, 0}, physical_next_block_id_k);
+            physical_next_block_id_v =
+                amd_wave_read_first_lane(v_page_block_navigator.prefetch_table_id(
+                    i_page_block_v, v_dram_window, {0, k_move_offset - kN0}));
+            i_page_block_v = v_page_block_navigator.move_tile_window(
+                i_page_block_v, v_dram_window, {0, k_move_offset - kN0}, physical_next_block_id_v);
             // tail
             {
                 block_sync_lds();
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
index 7c4a921b70..a192e3f7b0 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
@@ -62,6 +62,7 @@ struct BlockFmhaPipelineProblem
     static constexpr bool kHasDropout       = Traits::kHasDropout;
     static constexpr auto QScaleEnum        = Traits::QScaleEnum;
     static constexpr index_t kBlockPerCu    = Traits::kBlockPerCu;
+    static constexpr bool kHasSink          = Traits::kHasSink;
 };
 
 template <typename QDataType_,
@@ -114,6 +115,7 @@ struct BlockFmhaFwdPagedKVPipelineProblem
     static constexpr bool kDoFp8StaticQuant = Traits::kDoFp8StaticQuant;
     static constexpr bool kIsPagedKV        = Traits::kIsPagedKV;
     static constexpr index_t kBlockPerCu    = Traits::kBlockPerCu;
+    static constexpr bool kHasSink          = Traits::kHasSink;
 };
 
 template <typename QDataType_,
@@ -167,6 +169,7 @@ struct BlockFmhaFwdSplitKVPipelineProblem
     static constexpr bool kHasUnevenSplits           = kIsGroupMode || Traits::kHasUnevenSplits;
     static constexpr bool kMergeNumHeadGroupsSeqLenQ = Traits::kMergeNumHeadGroupsSeqLenQ;
     static constexpr index_t kBlockPerCu             = Traits::kBlockPerCu;
+    static constexpr bool kHasSink                   = Traits::kHasSink;
 };
 
 // extract tile size attributes to remove dependency on traits
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
index 9e1eb3bdec..fe825a370a 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
@@ -57,6 +57,7 @@ struct BlockFmhaPipelineQRKSVS
     static constexpr auto BiasEnum          = Problem::BiasEnum;
     static constexpr bool kStoreLSE         = Problem::kStoreLSE;
     static constexpr bool kHasDropout       = Problem::kHasDropout;
+    static constexpr bool kHasSink          = Problem::kHasSink;
 
     static constexpr uint32_t DS_READ = 0x100; // Barrier for DS (data share) read
     static constexpr uint32_t MFMA    = 0x008; // Barrier for MFMA (matrix multiply-accumulate)
@@ -233,10 +234,26 @@ struct BlockFmhaPipelineQRKSVS
         clear_tile(l);
 
         const auto q_origin = q_dram_window.get_window_origin();
-        const auto [seqlen_k_start, seqlen_k_end] =
-            mask.GetTileRangeAlongX(q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
 
-        const auto num_total_loop = integer_divide_ceil(seqlen_k_end - seqlen_k_start, kN0);
+        const auto tile_range_result = [&mask, &q_origin]() {
+            if constexpr(kHasSink)
+                return mask.GetSinkTileRangeAlongX(
+                    q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
+            else
+            {
+                auto [start, end] =
+                    mask.GetTileRangeAlongX(q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
+                return ck_tile::make_tuple(0, start, end);
+            }
+        }();
+        const auto sink_seq_end   = tile_range_result.get(ck_tile::number<0>{});
+        const auto seqlen_k_start = tile_range_result.get(ck_tile::number<1>{});
+        const auto seqlen_k_end   = tile_range_result.get(ck_tile::number<2>{});
+
+        const auto kv_load_start = (sink_seq_end == 0 && seqlen_k_start > 0) ? seqlen_k_start : 0;
+        const auto num_sink_loop = integer_divide_ceil(sink_seq_end, kN0);
+        const auto num_total_loop =
+            integer_divide_ceil(seqlen_k_end - seqlen_k_start, kN0) + num_sink_loop;
 
         // check early exit if no work to do
         if constexpr(FmhaMask::IsMasking || kPadSeqLenK)
@@ -262,22 +279,22 @@ struct BlockFmhaPipelineQRKSVS
         auto k_dram_block_window =
             make_tile_window(k_dram_block_window_tmp.get_bottom_tensor_view(),
                              k_dram_block_window_tmp.get_window_lengths(),
-                             {seqlen_k_start, 0});
+                             {kv_load_start, 0});
 
         const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
         auto bias_dram_window =
             make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
                              bias_dram_block_window_tmp.get_window_lengths(),
-                             {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N
+                             {bias_origin.at(number<0>{}), kv_load_start}, // M/N
                              Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
 
         auto randval_dram_window = dropout.template MakeRandvalDramWindow<decltype(gemm_0)>(
-            randval_dram_block_window_tmp, seqlen_k_start);
+            randval_dram_block_window_tmp, kv_load_start);
 
         auto v_dram_window =
             make_tile_window(v_dram_block_window_tmp.get_bottom_tensor_view(),
                              v_dram_block_window_tmp.get_window_lengths(),
-                             {0, seqlen_k_start}, // TODO: hdim split?
+                             {0, kv_load_start}, // TODO: hdim split?
                              Policy::template MakeVDramTileDistribution<Problem>());
 
         auto q_tile = tile_elementwise_in(q_element_func, q);
@@ -450,6 +467,11 @@ struct BlockFmhaPipelineQRKSVS
 #endif
                 }
             }
+            if constexpr(kHasSink)
+            {
+                if(i_total_loops == 0)
+                    move_tile_window(bias_dram_window, {0, seqlen_k_start - sink_seq_end});
+            }
             move_tile_window(bias_dram_window, {0, kN0});
             if constexpr(kPadSeqLenK || FmhaMask::IsMasking)
             {
@@ -460,17 +482,34 @@ struct BlockFmhaPipelineQRKSVS
                                                            number<kN0>{});
                 if(need_perpixel_check)
                 {
-                    set_tile_if(
-                        s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
-                            const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
-                            const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
-                            return !variant.LogitsMask(variant_params,
-                                                       block_indices.batch_idx,
-                                                       row,
-                                                       col,
-                                                       block_indices.qo_head_idx,
-                                                       block_indices.kv_head_idx);
+                    auto apply_mask = [&](auto&& mask_func) {
+                        set_tile_if(
+                            s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
+                                const auto row =
+                                    q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
+                                const auto col =
+                                    k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                                return !mask_func(variant_params,
+                                                  block_indices.batch_idx,
+                                                  row,
+                                                  col,
+                                                  block_indices.qo_head_idx,
+                                                  block_indices.kv_head_idx);
+                            });
+                    };
+
+                    if constexpr(kHasSink)
+                    {
+                        apply_mask([&](auto&&... args) {
+                            return variant.LogitsSinkMask(std::forward<decltype(args)>(args)...);
                         });
+                    }
+                    else
+                    {
+                        apply_mask([&](auto&&... args) {
+                            return variant.LogitsMask(std::forward<decltype(args)>(args)...);
+                        });
+                    }
                 }
             }
 
@@ -580,11 +619,23 @@ struct BlockFmhaPipelineQRKSVS
 
             if constexpr(kHasDropout)
             {
-                // K and dropout use the same address in LDS, finish loading from k_lds_window by
-                // gemm_0 to reuse LDS.
                 block_sync_lds();
+                auto randval_ptr = reinterpret_cast<char*>(smem_ptr);
+
+                index_t seq_offset = [&]() {
+                    if constexpr(!kHasSink)
+                        return seqlen_k_start + i_total_loops * kN0;
+
+                    const bool in_sink_phase = (num_sink_loop > i_total_loops);
+                    if(i_total_loops == num_sink_loop)
+                        move_tile_window(randval_dram_window, {0, seqlen_k_start - sink_seq_end});
+
+                    return in_sink_phase ? (kv_load_start + i_total_loops * kN0)
+                                         : (seqlen_k_start + (i_total_loops - num_sink_loop) * kN0);
+                }();
+
                 dropout.template Run<decltype(gemm_0), SMPLComputeDataType, RandValOutputDataType>(
-                    smem_ptr, seqlen_k_start + i_total_loops * kN0, p_compute, randval_dram_window);
+                    randval_ptr, seq_offset, p_compute, randval_dram_window);
             }
 
             block_sync_lds();
@@ -636,6 +687,14 @@ struct BlockFmhaPipelineQRKSVS
                 });
             }
             // move K tile windows
+            if constexpr(kHasSink)
+            {
+                if(i_total_loops == 0)
+                {
+                    move_tile_window(k_dram_block_window, {seqlen_k_start - sink_seq_end, 0});
+                    move_tile_window(v_dram_window, {0, seqlen_k_start - sink_seq_end});
+                }
+            }
             move_tile_window(k_dram_block_window, {kN0, 0});
             // tail
             {
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
index e07516cc27..f57b89cf9d 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
@@ -62,6 +62,7 @@ struct BlockFmhaPipelineQRKSVSAsync
     static constexpr auto BiasEnum          = Problem::BiasEnum;
     static constexpr bool kStoreLSE         = Problem::kStoreLSE;
     static constexpr bool kHasDropout       = Problem::kHasDropout;
+    static constexpr bool kHasSink          = Problem::kHasSink;
 
     static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 &&
                    (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS ||
@@ -277,11 +278,26 @@ struct BlockFmhaPipelineQRKSVSAsync
         clear_tile(l);
 
         __builtin_amdgcn_sched_barrier(0);
-        const auto q_origin = q_dram_window.get_window_origin();
-        const auto [seqlen_k_start, seqlen_k_end] =
-            mask.GetTileRangeAlongX(q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
+        const auto q_origin          = q_dram_window.get_window_origin();
+        const auto tile_range_result = [&mask, &q_origin]() {
+            if constexpr(kHasSink)
+                return mask.GetSinkTileRangeAlongX(
+                    q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
+            else
+            {
+                auto [start, end] =
+                    mask.GetTileRangeAlongX(q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
+                return ck_tile::make_tuple(0, start, end);
+            }
+        }();
+        const auto sink_seq_end   = tile_range_result.get(ck_tile::number<0>{});
+        const auto seqlen_k_start = tile_range_result.get(ck_tile::number<1>{});
+        const auto seqlen_k_end   = tile_range_result.get(ck_tile::number<2>{});
 
-        const auto num_total_loop = integer_divide_ceil(seqlen_k_end - seqlen_k_start, kN0);
+        const auto kv_load_start = (sink_seq_end == 0 && seqlen_k_start > 0) ? seqlen_k_start : 0;
+        const auto num_sink_loop = integer_divide_ceil(sink_seq_end, kN0);
+        const auto num_total_loop =
+            integer_divide_ceil(seqlen_k_end - seqlen_k_start, kN0) + num_sink_loop;
 
         // check early exit if no work to do
         if constexpr(FmhaMask::IsMasking || kPadSeqLenK)
@@ -309,7 +325,7 @@ struct BlockFmhaPipelineQRKSVSAsync
         auto k_dram_block_window =
             make_tile_window(k_dram_block_window_tmp.get_bottom_tensor_view(),
                              k_dram_block_window_tmp.get_window_lengths(),
-                             {seqlen_k_start, 0});
+                             {kv_load_start, 0});
 
         auto k_dram_window = make_tile_window(
             k_dram_block_window.get_bottom_tensor_view(),
@@ -332,16 +348,16 @@ struct BlockFmhaPipelineQRKSVSAsync
         auto bias_dram_window =
             make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
                              bias_dram_block_window_tmp.get_window_lengths(),
-                             {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N
+                             {bias_origin.at(number<0>{}), kv_load_start}, // M/N
                              Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
 
         auto randval_dram_window = dropout.template MakeRandvalDramWindow<decltype(gemm_0)>(
-            randval_dram_block_window_tmp, seqlen_k_start);
+            randval_dram_block_window_tmp, kv_load_start);
 
         auto v_dram_window =
             make_tile_window(v_dram_block_window_tmp.get_bottom_tensor_view(),
                              v_dram_block_window_tmp.get_window_lengths(),
-                             {0, seqlen_k_start}, // TODO: hdim split?
+                             {0, kv_load_start}, // TODO: hdim split?
                              Policy::template MakeVDramTileDistribution<Problem>());
 
         // prefetch K tile
@@ -478,6 +494,11 @@ struct BlockFmhaPipelineQRKSVSAsync
 #endif
                 }
             }
+            if constexpr(kHasSink)
+            {
+                if(i_total_loops == 0)
+                    move_tile_window(bias_dram_window, {0, seqlen_k_start - sink_seq_end});
+            }
             move_tile_window(bias_dram_window, {0, kN0});
             if constexpr(kPadSeqLenK || FmhaMask::IsMasking)
             {
@@ -489,17 +510,34 @@ struct BlockFmhaPipelineQRKSVSAsync
 
                 if(need_perpixel_check)
                 {
-                    set_tile_if(
-                        s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
-                            const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
-                            const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
-                            return !variant.LogitsMask(variant_params,
-                                                       block_indices.batch_idx,
-                                                       row,
-                                                       col,
-                                                       block_indices.qo_head_idx,
-                                                       block_indices.kv_head_idx);
+                    auto apply_mask = [&](auto&& mask_func) {
+                        set_tile_if(
+                            s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
+                                const auto row =
+                                    q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
+                                const auto col =
+                                    k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                                return !mask_func(variant_params,
+                                                  block_indices.batch_idx,
+                                                  row,
+                                                  col,
+                                                  block_indices.qo_head_idx,
+                                                  block_indices.kv_head_idx);
+                            });
+                    };
+
+                    if constexpr(kHasSink)
+                    {
+                        apply_mask([&](auto&&... args) {
+                            return variant.LogitsSinkMask(std::forward<decltype(args)>(args)...);
                         });
+                    }
+                    else
+                    {
+                        apply_mask([&](auto&&... args) {
+                            return variant.LogitsMask(std::forward<decltype(args)>(args)...);
+                        });
+                    }
                 }
             }
 
@@ -647,11 +685,21 @@ struct BlockFmhaPipelineQRKSVSAsync
             {
                 auto randval_ptr =
                     reinterpret_cast<char*>(smem_ptr) + Policy::template GetSmemSizeKV<Problem>();
+
+                index_t seq_offset = [&]() {
+                    if constexpr(!kHasSink)
+                        return seqlen_k_start + i_total_loops * kN0;
+
+                    const bool in_sink_phase = (num_sink_loop > i_total_loops);
+                    if(i_total_loops == num_sink_loop)
+                        move_tile_window(randval_dram_window, {0, seqlen_k_start - sink_seq_end});
+
+                    return in_sink_phase ? (kv_load_start + i_total_loops * kN0)
+                                         : (seqlen_k_start + (i_total_loops - num_sink_loop) * kN0);
+                }();
+
                 dropout.template Run<decltype(gemm_0), SMPLComputeDataType, RandValOutputDataType>(
-                    randval_ptr,
-                    seqlen_k_start + i_total_loops * kN0,
-                    p_compute,
-                    randval_dram_window);
+                    randval_ptr, seq_offset, p_compute, randval_dram_window);
             }
 
             const auto p = [&]() {
@@ -717,8 +765,16 @@ struct BlockFmhaPipelineQRKSVSAsync
             i_total_loops++;
             if(i_total_loops < num_total_loop)
             {
-                // move K tile windows
+                if constexpr(kHasSink) // skip K/V ahead by (seqlen_k_start - sink_seq_end)
+                {
+                    if(i_total_loops == 1) // counter was already incremented above
+                    {
+                        move_tile_window(k_dram_block_window, {seqlen_k_start - sink_seq_end, 0});
+                        move_tile_window(v_dram_window, {0, seqlen_k_start - sink_seq_end});
+                    }
+                }
                 move_tile_window(k_dram_block_window, {kN0, 0});
+
                 k_dram_window.set_window_origin(k_dram_block_window.get_window_origin());
 
                 if constexpr(k1_loops >= 2 &&
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp
index 5d224a6adf..26662dafeb 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp
@@ -69,6 +69,7 @@ struct BlockFmhaPipelineQRKSVSAsyncTrload
     static constexpr auto BiasEnum          = Problem::BiasEnum;
     static constexpr bool kStoreLSE         = Problem::kStoreLSE;
     static constexpr bool kHasUnevenSplits  = true;
+    static constexpr bool kHasSink          = Problem::kHasSink;
 
     static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 &&
                    (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS ||
diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp
index df33a93696..757a852c19 100644
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp
@@ -20,8 +20,9 @@ template <bool kPadSeqLenQ_ /* padding for seqlen_q */,
           bool kStoreLSE_,
           bool kHasDropout_,
           BlockAttentionQuantScaleEnum QScaleEnum_,
-          index_t kBlockPerCu_  = -1, /* overwrite occupancy if not -1 */
-          bool kSkipMinSeqlenQ_ = false /* skip min seqlen q while chunked prefill */>
+          index_t kBlockPerCu_  = -1,    /* overwrite occupancy if not -1 */
+          bool kSkipMinSeqlenQ_ = false, /* skip min seqlen q while chunked prefill */
+          bool kHasSink_        = false>
 struct TileFmhaTraits
 {
     static constexpr bool kPadSeqLenQ       = kPadSeqLenQ_;
@@ -36,6 +37,7 @@ struct TileFmhaTraits
     static constexpr auto QScaleEnum        = QScaleEnum_;
     static constexpr index_t kBlockPerCu    = kBlockPerCu_;
     static constexpr bool kSkipMinSeqlenQ   = kSkipMinSeqlenQ_;
+    static constexpr bool kHasSink          = kHasSink_;
 };
 
 template <index_t kPadHeadDimQ_ /* paddding for hdim_q */,
@@ -65,8 +67,9 @@ template <bool kPadSeqLenQ_ /* padding for seqlen_q */,
           bool kStoreLSE_, /* set to true if either num_splits > 1 or fwd training is running */
           bool kIsPagedKV_,
           bool kDoFp8StaticQuant_,
-          index_t kBlockPerCu_  = -1, /* overwrite occupancy if not -1 */
-          bool kSkipMinSeqlenQ_ = false /* skip min seqlen q while chunked prefill */>
+          index_t kBlockPerCu_  = -1,    /* overwrite occupancy if not -1 */
+          bool kSkipMinSeqlenQ_ = false, /* skip min seqlen q while chunked prefill */
+          bool kHasSink_        = false>
 struct TileFmhaFwdPagedKVTraits
 {
     static constexpr bool kPadSeqLenQ       = kPadSeqLenQ_;
@@ -81,6 +84,7 @@ struct TileFmhaFwdPagedKVTraits
     static constexpr bool kDoFp8StaticQuant = kDoFp8StaticQuant_;
     static constexpr index_t kBlockPerCu    = kBlockPerCu_;
     static constexpr bool kSkipMinSeqlenQ   = kSkipMinSeqlenQ_;
+    static constexpr bool kHasSink          = kHasSink_;
 };
 
 template <bool kPadSeqLenQ_ /* padding for seqlen_q */,
@@ -95,7 +99,8 @@ template <bool kPadSeqLenQ_ /* padding for seqlen_q */,
           bool kIsPagedKV_,
           bool kHasUnevenSplits_,
           bool kMergeNumHeadGroupsSeqLenQ_ = false,
-          index_t kBlockPerCu_             = -1 /* overwrite occupancy if not -1 */>
+          index_t kBlockPerCu_             = -1, /* overwrite occupancy if not -1 */
+          bool kHasSink_                   = false>
 struct TileFmhaFwdSplitKVTraits
 {
     static constexpr bool kPadSeqLenQ       = kPadSeqLenQ_;
@@ -112,6 +117,7 @@ struct TileFmhaFwdSplitKVTraits
     static constexpr bool kHasUnevenSplits           = kHasUnevenSplits_;
     static constexpr bool kMergeNumHeadGroupsSeqLenQ = kMergeNumHeadGroupsSeqLenQ_;
     static constexpr index_t kBlockPerCu             = kBlockPerCu_;
+    static constexpr bool kHasSink                   = kHasSink_;
 };
 
 template <bool kPadSeqLenQ_ /* padding for seqlen_q */,

From 3143a5a480e4fcf216670012fe491b44324f03b6 Mon Sep 17 00:00:00 2001
From: Johannes Graner <johannes.graner@amd.com>
Date: Mon, 15 Dec 2025 08:03:00 +0100
Subject: [PATCH 10/10] [CK Grouped Gemm] Disable split-k kernel for split-k >
 1 with non-contiguous strides (#3405)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Disable kernel for split-k > 1 with non-contiguous strides

* Update device_grouped_gemm_xdl_splitk_cshuffle.hpp

---------

AICK-441 (partial)

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
---
 ...evice_grouped_gemm_xdl_splitk_cshuffle.hpp | 39 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index ec48beb789..1db9fd45b8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -620,7 +620,44 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         bool isWave64  = get_warp_size() == 64;
         for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i)
         {
-            const auto& a        = arg.gemm_kernel_args_[i].karg_;
+            const auto& a = arg.gemm_kernel_args_[i].karg_;
+
+            // Validate stride requirements for SplitK (k_batch > 1).
+            // TODO: Enable SplitK (k_batch > 1) with non-contiguous output strides.
+            if(a.k_batch > 1)
+            {
+                if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
+                {
+                    if(a.StrideC != a.N)
+                    {
+                        if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                        {
+                            std::cout << "[" << __func__ << "] group id: " << i
+                                      << " SplitK (k_batch=" << a.k_batch
+                                      << ") requires contiguous output stride."
+                                      << " For RowMajor layout: StrideC must equal N."
+                                      << " Got StrideC=" << a.StrideC << ", N=" << a.N << std::endl;
+                        }
+                        return false;
+                    }
+                }
+                else if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::ColumnMajor>)
+                {
+                    if(a.StrideC != a.M)
+                    {
+                        if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                        {
+                            std::cout << "[" << __func__ << "] group id: " << i
+                                      << " SplitK (k_batch=" << a.k_batch
+                                      << ") requires contiguous output stride."
+                                      << " For ColumnMajor layout: StrideC must equal M."
+                                      << " Got StrideC=" << a.StrideC << ", M=" << a.M << std::endl;
+                        }
+                        return false;
+                    }
+                }
+            }
+
             bool group_arg_valid = false;
             if(isWave64)
             {