[CK_TILE] Enable vector stores for C Column Layout part1

2026-06-30 11:47:48 +00:00 · 2025-10-21 06:19:38 -05:00
parent 22a934a229
commit e45991b379
7 changed files with 602 additions and 99 deletions
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -615,9 +615,9 @@ struct buffer_view<address_space_enum::global,
        using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;

        // X contains multiple T
-        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size; // 1

-        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size; // 8

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                      "wrong! X should contain multiple T");
@@ -650,7 +650,15 @@ struct buffer_view<address_space_enum::global,
        bool constexpr use_amd_buffer_addressing = false;
 #endif

-        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; // 8
+        //static_assert(scalar_per_x_vector > scalar_per_t_vector, "Condition not met: (( scalar_per_x_vector > scalar_per_t_vector ))");
+        //if(threadIdx.x == 0)
+        //{   
+        //    printf("[DEBUG]: BufferView: t_per_x: %d\n",t_per_x);
+        //    printf("[DEBUG]: BufferView: scalar_per_x_vector: %d\n",scalar_per_x_vector);
+        //    printf("[DEBUG]: BufferView: scalar_per_t_vector: %d\n",scalar_per_t_vector);
+        //    printf("[DEBUG]: BufferView: x.size(): %d\n",x.size());
+        //}

        if constexpr(use_amd_buffer_addressing)
        {
--- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -65,6 +65,24 @@ struct CShuffleEpilogueProblem

    static_assert(NumDTensor == DsLayout::size(),
                  "The size of DsDataType and DsLayout should be the same");
+
+    CK_TILE_HOST static void PrintInfo() {
+        printf("[DEBUG]: CShuffleEpilogueProblem: kBlockSize: %d\n",kBlockSize);
+        printf("[DEBUG]: CShuffleEpilogueProblem: kMPerBlock: %d\n",kMPerBlock);
+        printf("[DEBUG]: CShuffleEpilogueProblem: kNPerBlock: %d\n",kNPerBlock);
+        printf("[DEBUG]: CShuffleEpilogueProblem: MWave: %d\n",MWave);
+        printf("[DEBUG]: CShuffleEpilogueProblem: NWave: %d\n",NWave);
+        printf("[DEBUG]: CShuffleEpilogueProblem: MPerXdl: %d\n",MPerXdl);
+        printf("[DEBUG]: CShuffleEpilogueProblem: NPerXdl: %d\n",NPerXdl);
+        printf("[DEBUG]: CShuffleEpilogueProblem: KPerXdl: %d\n",KPerXdl);
+        printf("[DEBUG]: CShuffleEpilogueProblem: isCTransposed: %d\n",isCTransposed);
+        printf("[DEBUG]: CShuffleEpilogueProblem: MemoryOperation: %d\n",static_cast<int>(MemoryOperation));
+        printf("[DEBUG]: CShuffleEpilogueProblem: FixedVectorSize: %d\n",static_cast<int>(FixedVectorSize));
+        printf("[DEBUG]: CShuffleEpilogueProblem: VectorSizeC: %d\n",VectorSizeC);
+        printf("[DEBUG]: CShuffleEpilogueProblem: TiledMMAPermuteN: %d\n",static_cast<int>(TiledMMAPermuteN));
+        printf("[DEBUG]: CShuffleEpilogueProblem: kNumWaveGroups: %d\n",kNumWaveGroups);
+        printf("[DEBUG]: CShuffleEpilogueProblem: NumDTensor: %d\n",NumDTensor);
+    }
 };

 template <typename Problem_, typename Policy_ = void>
@@ -122,6 +140,19 @@ struct CShuffleEpilogue
    CDElementwise elfunc_;

    CK_TILE_DEVICE CShuffleEpilogue(CDElementwise elfunc = CDElementwise{}) : elfunc_(elfunc) {};
+    static constexpr bool IsERowMajor =
+        std::is_same_v<ELayout, tensor_layout::gemm::RowMajor> ? true : false;
+
+    CK_TILE_HOST static void PrintInfo() {
+        printf("[DEBUG]: CShuffleEpilogue: MPerIteration: %d\n",MPerIteration);
+        printf("[DEBUG]: CShuffleEpilogue: NPerIteration: %d\n",NPerIteration);
+        printf("[DEBUG]: CShuffleEpilogue: MRepeat: %d\n",MRepeat);
+        printf("[DEBUG]: CShuffleEpilogue: NRepeat: %d\n",NRepeat);
+        printf("[DEBUG]: CShuffleEpilogue: GetVectorSizeC: %d\n",GetVectorSizeC());
+        printf("[DEBUG]: CShuffleEpilogue: get_warp_size: %d\n",get_warp_size());
+        printf("[DEBUG]: CShuffleEpilogue: MPerIterationShuffle: %d\n",MPerIterationShuffle);
+        printf("[DEBUG]: CShuffleEpilogue: NPerIterationShuffle: %d\n",NPerIterationShuffle);
+    }

    static_assert(NumDTensor == DsLayout::size(),
                  "The size of DsDataType and DsLayout should be the same");
@@ -248,6 +279,22 @@ struct CShuffleEpilogue
    static constexpr index_t MPerIterationShuffle = std::get<0>(MNPerIterationShuffle);
    static constexpr index_t NPerIterationShuffle = std::get<1>(MNPerIterationShuffle);

+    static constexpr index_t NumYXdlPerWavePerShuffle =
+        IsERowMajor ? NumMXdlPerWavePerShuffle : NumNXdlPerWavePerShuffle;
+    static constexpr index_t NumXXdlPerWavePerShuffle =
+        IsERowMajor ? NumNXdlPerWavePerShuffle : NumMXdlPerWavePerShuffle;
+
+    static constexpr index_t YPerIterationShuffle =
+        IsERowMajor ? MPerIterationShuffle : NPerIterationShuffle;
+    static constexpr index_t XPerIterationShuffle =
+        IsERowMajor ? NPerIterationShuffle : MPerIterationShuffle;
+
+    static constexpr index_t YPerBlock = IsERowMajor ? kMPerBlock : kNPerBlock;
+    static constexpr index_t XPerBlock = IsERowMajor ? kNPerBlock : kMPerBlock;
+
+    static constexpr index_t YWave = IsERowMajor ? MWave : NWave;
+    static constexpr index_t XWave = IsERowMajor ? NWave : MWave;
+
    using WG = WarpGemmDispatcher<ATypeToUse,
                                  BTypeToUse,
                                  AccDataType,
@@ -277,8 +324,8 @@ struct CShuffleEpilogue
        else if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::ColumnMajor>)
        {
            return make_naive_tensor_descriptor(
-                make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
-                make_tuple(number<1>{}, number<MPerIterationShuffle>{}));
+                make_tuple(number<NPerIterationShuffle>{}, number<MPerIterationShuffle>{}),
+                make_tuple(number<MPerIterationShuffle>{}, number<1>{}));
        }
        else
        {
@@ -292,8 +339,8 @@ struct CShuffleEpilogue
            if constexpr(BlockedXDLN_PerWarp == 1)
            {
                return tile_distribution_encoding<sequence<>,
-                                                  tuple<sequence<NumMXdlPerWavePerShuffle, MWave>,
-                                                        sequence<NumNXdlPerWavePerShuffle, NWave>>,
+                                                  tuple<sequence<NumYXdlPerWavePerShuffle, YWave>,
+                                                        sequence<NumXXdlPerWavePerShuffle, XWave>>,
                                                  tuple<sequence<1, 2>>,
                                                  tuple<sequence<1, 1>>,
                                                  sequence<1, 2>,
@@ -301,12 +348,12 @@ struct CShuffleEpilogue
            }
            else
            {
-                constexpr int RakedXDLN_PerWarp = NumNXdlPerWavePerShuffle / BlockedXDLN_PerWarp;
+                constexpr int RakedXDLN_PerWarp = NumXXdlPerWavePerShuffle / BlockedXDLN_PerWarp;
                // BlockedLayout
                return tile_distribution_encoding<
                    sequence<>,
-                    tuple<sequence<NumMXdlPerWavePerShuffle, MWave>,
-                          sequence<RakedXDLN_PerWarp, NWave, BlockedXDLN_PerWarp>>,
+                    tuple<sequence<NumYXdlPerWavePerShuffle, YWave>,
+                          sequence<RakedXDLN_PerWarp, XWave, BlockedXDLN_PerWarp>>,
                    tuple<sequence<1, 2>>,
                    tuple<sequence<1, 1>>,
                    sequence<1, 2, 2>,
@@ -622,22 +669,65 @@ struct CShuffleEpilogue
        auto o_lds_block              = make_tensor_view<address_space_enum::lds>(
            static_cast<ODataType*>(p_smem), lds_block_desc);

-        auto in_lds_window = make_tile_window(
-            o_lds_block,
-            make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
-            {0, 0},
-            LdsTileDistr);
+        auto in_lds_window = [&o_lds_block, &LdsTileDistr] {
+            if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_tile_window(
+                            o_lds_block,
+                            make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
+                            {0, 0},
+                            LdsTileDistr);
+            }
+            else if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::ColumnMajor>)
+            {
+                return make_tile_window(
+                            o_lds_block,
+                            make_tuple(number<NPerIterationShuffle>{}, number<MPerIterationShuffle>{}),
+                            {0, 0},
+                            LdsTileDistr);
+            }
+            else
+            {
+                static_assert(false, "Unsupported ELayout!");
+            }
+        }();
+        //auto in_lds_window = make_tile_window(
+        //    o_lds_block,
+        //    make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
+        //    {0, 0},
+        //    LdsTileDistr);

-        auto out_lds_window = make_tile_window(
-            o_lds_block,
-            make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
-            {0, 0});
+        //auto out_lds_window = make_tile_window(
+        //    o_lds_block,
+        //    make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
+        //    {0, 0});
+
+        auto out_lds_window = [&o_lds_block] {
+            if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_tile_window(
+                            o_lds_block,
+                            make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
+                            {0, 0});
+            }
+            else if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::ColumnMajor>)
+            {
+               return make_tile_window(
+                            o_lds_block,
+                            make_tuple(number<NPerIterationShuffle>{}, number<MPerIterationShuffle>{}),
+                            {0, 0});
+            }
+            else
+            {
+                static_assert(false, "Unsupported ELayout!");
+            }
+        }();

        constexpr index_t num_access = SFC::get_num_of_access();
-
-        static_assert(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>,
-                      "Currently, the CShuffle Epilogue only supports the Row Major Output layout");
-
+        // TODO: Add support for Col Major Output Layout - CShuffle Epilogue
+        //static_assert(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>,
+        //              "Currently, the CShuffle Epilogue only supports the Row Major Output layout");
+        static_assert(GetVectorSizeC() > 1, "VectorSizeC is not greater than 1!");
        using TileEncodingPattern =
            tile_distribution_encoding_pattern_2d<kBlockSize,
                                                  MPerIterationShuffle,
--- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
@@ -740,9 +740,9 @@ struct UniversalGemmKernel
            {
                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
                    e_ptr,
-                    make_tuple(kargs.M, kargs.N),
-                    make_tuple(1, kargs.stride_E),
-                    number<1>{},
+                    make_tuple(kargs.N, kargs.M),
+                    make_tuple(kargs.stride_E, 1),
+                    number<EpiloguePipeline::GetVectorSizeC()>{},
                    number<1>{});
            }
        }();
@@ -831,9 +831,9 @@ struct UniversalGemmKernel
            else
            {
                return pad_tensor_view(e_tensor_view,
-                                       make_tuple(number<TilePartitioner::MPerBlock>{},
-                                                  number<TilePartitioner::NPerBlock>{}),
-                                       sequence<GemmPipeline::kPadM, false>{});
+                                       make_tuple(number<TilePartitioner::NPerBlock>{},
+                                                  number<TilePartitioner::MPerBlock>{}),
+                                       sequence<false, GemmPipeline::kPadM>{});
            }
        }();

@@ -929,10 +929,25 @@ struct UniversalGemmKernel
            },
            number<NumDTensor>{});

-        auto e_block_window = make_tile_window(
-            e_pad_view,
-            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
-            {i_m, i_n});
+
+
+        const auto e_block_window = [&] () {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_tile_window(e_pad_view,
+                                        make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                    number<TilePartitioner::NPerBlock>{}),
+                                        {i_m, i_n});
+            }
+            else
+            {
+                return make_tile_window(e_pad_view,
+                                        make_tuple(number<TilePartitioner::NPerBlock>{},
+                                                    number<TilePartitioner::MPerBlock>{}),
+                                        {i_n, i_m});
+            }
+        }();
+

        return make_tuple(as_block_window, bs_block_window, ds_block_window, e_block_window);
    }
@@ -986,7 +1001,19 @@ struct UniversalGemmKernel
            // Run Epilogue Pipeline
            auto& c_block_window = gemm_tile_windows.at(I3);

+            //if(threadIdx.x == 0)
+            //{  
+            //    printf("CShuffleEpilogue operator() called! Before\n");
+            //    c_block_window.template print_tile_window_range<EDataType>(0, 4, 0, 8, "A");
+            //}
+
            EpiloguePipeline{}(c_block_window, c_block_tile, ds_block_window, smem_ptr_0);
+
+            //if(threadIdx.x == 0)
+            //{  
+            //    printf("CShuffleEpilogue operator() called! After\n");
+            //    c_block_window.template print_tile_window_range<EDataType>(0, 4, 0, 8, "A");
+           // }
        }
    }

--- a/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
@@ -78,46 +78,46 @@ using KernelTypesMemWmma = ::testing::Types<
 >;

 using KernelTypesCompV3 = ::testing::Types<
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>
+    std::tuple<    Row,     Row,     Col,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>
+    //std::tuple<    Row,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Row,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    //std::tuple<    Col,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>
 >;

 using KernelTypesCompV3Wmma = ::testing::Types<
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          bool Persistent,
+          typename CDEElementWise>
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+
+{
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+        GemmConfig::PermuteA,
+        GemmConfig::PermuteB>;
+
+    using TilePartitioner =
+        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                   GemmConfig::TileParitionerGroupNum,
+                                                   GemmConfig::TileParitionerM01>;
+
+    using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                           GemmConfig::kPadN,
+                                           GemmConfig::kPadK,
+                                           ALayout,
+                                           BLayout,
+                                           ELayout,
+                                           GemmConfig::NumWaveGroups>;
+
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 ELayout,
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity,
+                                                                 Persistent,
+                                                                 GemmConfig::NumWaveGroups>;
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+    using BaseGemmPipeline = typename PipelineTypeTraits<
+        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+
+    const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
+    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+    float ave_time{0};
+
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;
+
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation,
+                                             GemmConfig::NumWaveGroups>>;
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        dim3 grids;
+        if constexpr(Persistent)
+        {
+            grids = Kernel::MaxOccupancyGridSize(s);
+        }
+        else
+        {
+            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
+        }
+        const dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw ArgumentsNotSupportedException(
+                "Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << GemmPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;
+            static constexpr ck_tile::index_t APackedSize =
+                std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
+            static constexpr ck_tile::index_t BPackedSize =
+                std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
+
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
+
+            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
+                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem.Print();
+
+            auto run_flush_cache = [&]() {
+                // flush icache
+                ck_tile::flush_icache();
+                // rotating mem
+                rotating_mem.Next();
+                // clear c mem
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+            };
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                run_flush_cache,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        }
+        else
+        {
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        }
+        return ave_time;
+    };
+
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        //if(args.k_batch == 1)
+        //{
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::set>{});
+        //}
+        //else
+        //{
+        //    Run(has_hot_loop_,
+        //        tail_number_,
+        //        ck_tile::integral_constant<ck_tile::memory_operation_enum,
+        //                                   ck_tile::memory_operation_enum::atomic_add>{});
+        //}
+    };
+
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+    return ave_time;
+}
+
+template <typename GemmConfig,
+          typename APrecType,
+          typename BPrecType = APrecType,
+          typename CPrecType = APrecType>
+bool run_gemm_test_prec_type(std::string a_layout,
+                             std::string b_layout,
+                             ck_tile::ArgParser& arg_parser)
+{
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Row{}, Col{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "C")
+        {
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices when "
+                                     "BPrecType is ck_tile::pk_int4_t!");
+        }
+    }
+    else
+    {
+        if(a_layout == "R" && b_layout == "R")
+        {
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Row{}, Row{}, Row{});
+        }
+        else if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Row{}, Col{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "R")
+        {
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Col{}, Row{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "C")
+        {
+            return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices!");
+        }
+    }
+}
+
+template <typename GemmConfig, typename APrecType, typename BPrecType, typename CPrecType>
+bool run_gemm_test(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return false;
+
+    std::string a_layout = arg_parser.get_str("a_layout");
+    std::string b_layout = arg_parser.get_str("b_layout");
+
+    return run_gemm_test_prec_type<GemmConfig, APrecType, BPrecType, CPrecType>(
+        a_layout, b_layout, arg_parser);
+}
+
+template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
+int run_gemm_combinations()
+{
+    // Define possible values for each parameter
+    std::vector<std::string> m_values = {"512", "1024"};
+    std::vector<std::string> n_values = {"512", "2048"};
+    std::vector<std::string> k_values = {"512", "1024"};
+
+    // We'll store all our arguments as strings first
+    std::vector<std::string> arg_strings = {"./bin/tile_example_gemm_universal",
+                                            "", // m placeholder
+                                            "", // n placeholder
+                                            "", // k placeholder
+                                            "-stride_a=0",
+                                            "-stride_b=0",
+                                            "-stride_c=0",
+                                            "-v=2",
+                                            "-warmup=0",
+                                            "-repeat=1"};
+
+    // Create an array of const char pointers for argv
+    constexpr size_t ARG_COUNT   = 10;
+    constexpr size_t ARG_MAX_LEN = 64;
+    char args[ARG_COUNT][ARG_MAX_LEN];
+    char* argv[ARG_COUNT];
+
+    // Run all combinations
+    bool is_success = true;
+    for(const auto& m : m_values)
+    {
+        arg_strings[1] = "-m=" + m;
+
+        for(const auto& n : n_values)
+        {
+            arg_strings[2] = "-n=" + n;
+
+            for(const auto& k : k_values)
+            {
+                arg_strings[3] = "-k=" + k;
+
+                // Set up the argv array with pointers to the string data
+                for(size_t i = 0; i < ARG_COUNT; i++)
+                {
+                    strncpy(args[i], arg_strings[i].c_str(), ARG_MAX_LEN);
+                    argv[i] = args[i];
+                }
+
+                std::cout << "Arguments received: ";
+                for(size_t i = 1; i < ARG_COUNT; ++i)
+                {
+                    std::cout << argv[i] << " ";
+                }
+                std::cout << std::endl;
+
+                // Call the function with the current configuration
+                try
+                {
+#if CK_TILE_USE_WMMA
+                    is_success = run_gemm_test<GemmConfigComputeV3_WMMA<CPrecType>,
+                                               APrecType,
+                                               BPrecType,
+                                               CPrecType>(ARG_COUNT, argv) &&
+                                 is_success;
+#else
+                    is_success = run_gemm_test<GemmConfigComputeV3<CPrecType>,
+                                               APrecType,
+                                               BPrecType,
+                                               CPrecType>(ARG_COUNT, argv) &&
+                                 is_success;
+                    is_success = run_gemm_test<GemmConfigComputeV3_2<CPrecType>,
+                                               APrecType,
+                                               BPrecType,
+                                               CPrecType>(ARG_COUNT, argv) &&
+                                 is_success;
+#endif
+                }
+                catch(const ArgumentsNotSupportedException& e)
+                {
+                    std::cerr << "Caught ArgumentsNotSupportedException: " << e.what() << '\n';
+                    // ArgumentsNotSupportedException  is not an error. Do not change is_success
+                }
+                catch(const std::runtime_error& e)
+                {
+                    std::cerr << "Caught runtime error: " << e.what() << '\n';
+                    is_success = false;
+                }
+            }
+        }
+    }
+    return is_success;
+}
--- a/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
@@ -120,9 +120,9 @@ TYPED_TEST(TEST_SUITE_NAME, PaddK)

 TYPED_TEST(TEST_SUITE_NAME, Regular)
 {
-    std::vector<int> Ms{512};
-    constexpr int N = 1024;
-    constexpr int K = 512;
+    std::vector<int> Ms{128};
+    constexpr int N = 128;
+    constexpr int K = 128;

    for(int M : Ms)
        this->Run(M, N, K);
--- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
@@ -174,6 +174,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
                                                                     Persistent,
                                                                     NumWaveGroup,
                                                                     preshuffle>;
+        printf("[DEBUG] VectorSize_: %d\n", GemmUniversalTraits::_VectorSize);

        using GemmPipelineProblem =
            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
@@ -224,6 +225,8 @@ class TestCkTileGemmPipeline : public ::testing::Test
                                                 K_Warp_Tile,
                                                 UniversalGemmProblem::TransposeC,
                                                 memory_operation>>;
+            //GemmEpilogue::Problem::PrintInfo();
+            //GemmEpilogue::PrintInfo();

            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
            auto kargs   = Kernel::MakeKernelArgs(args);
@@ -256,20 +259,20 @@ class TestCkTileGemmPipeline : public ::testing::Test
        };

        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
-            if(args.k_batch == 1)
-            {
+            //if(args.k_batch == 1)
+            //{
                Run(has_hot_loop_,
                    tail_number_,
                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
                                               ck_tile::memory_operation_enum::set>{});
-            }
-            else
-            {
-                Run(has_hot_loop_,
-                    tail_number_,
-                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                               ck_tile::memory_operation_enum::atomic_add>{});
-            }
+            //}
+            //else
+            //{
+            //    Run(has_hot_loop_,
+            //        tail_number_,
+            //        ck_tile::integral_constant<ck_tile::memory_operation_enum,
+            //                                   ck_tile::memory_operation_enum::atomic_add>{});
+            //}
        };

        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
@@ -284,17 +287,17 @@ class TestCkTileGemmPipeline : public ::testing::Test
        {
            GTEST_SKIP() << "Unsupported data type combination for gemm pipeline test.";
        }
-        if constexpr(PipelineType == GemmPipelineType::CompV4 ||
-                     std::is_same_v<BDataType, ck_tile::pk_int4_t>)
-        {
+        //if constexpr(PipelineType == GemmPipelineType::CompV4 ||
+        //             std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        //{
            // Only do k_batch = 1 when pipeline is CompV4, or BDataType is I4
-            k_batches_ = {1};
-        }
-        else
-        {
-            // Otherwise, use k_batch = 1 and 2
-            k_batches_ = {1, 2};
-        }
+        k_batches_ = {1};
+        //}
+        //else
+        //{
+        //    // Otherwise, use k_batch = 1 and 2
+        //    k_batches_ = {1, 2};
+        //}
    }

    template <bool PadM = true, bool PadN = true, bool PadK = true, bool Preshuffle = false>
@@ -338,8 +341,15 @@ class TestCkTileGemmPipeline : public ::testing::Test
        ck_tile::HostTensor<CDataType> c_m_n_dev_result(
            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));

-        ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5, 5, 11939}(a_m_k);
-        ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5, 5, 11940}(b_k_n);
+        std::cout << "a_m_k: ";
+        a_m_k.print_first_n(std::cout) << '\n';
+        std::cout << "b_k_n: ";
+        b_k_n.print_first_n(std::cout) << '\n';
+        std::cout << "c_m_n_dev_result: ";
+        c_m_n_dev_result.print_first_n(std::cout) << '\n';
+
+        ck_tile::FillUniformDistributionIntegerValue<ADataType>{1, 2, 11939}(a_m_k);
+        ck_tile::FillUniformDistributionIntegerValue<BDataType>{1, 2, 11940}(b_k_n);

        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
@@ -372,7 +382,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
                                      stride_B,
                                      stride_C};

-        invoke_gemm<PadM, PadN, PadK, Preshuffle>(args, ck_tile::stream_config{nullptr, false});
+        invoke_gemm<PadM, PadN, PadK, Preshuffle>(args, ck_tile::stream_config{nullptr, false, 2});

        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
        bool pass = true;
@@ -384,6 +394,13 @@ class TestCkTileGemmPipeline : public ::testing::Test
        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
            a_m_k, b_k_n, c_m_n_host_ref);

+        std::cout << "a_m_k: ";
+        a_m_k.print_first_n(std::cout) << '\n';
+        std::cout << "b_k_n: ";
+        b_k_n.print_first_n(std::cout) << '\n';
+        std::cout << "c_m_n_dev_result: ";
+        c_m_n_dev_result.print_first_n(std::cout) << '\n';
+
        const float max_accumulated_value =
            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(