From 376a62fcc11bbe6704d24499eeaf9c16fd73b5dc Mon Sep 17 00:00:00 2001
From: SamiAario-AMD <samaario@amd.com>
Date: Thu, 13 Nov 2025 21:01:27 +0200
Subject: [PATCH] Remove "basic" and universal GEMM tests, and incorporate
 their test cases into the GEMM pipeline tests (#3094)

* Add missing copyright statements

* Use ck_tile::host_tensor_descriptor instead of a custom lambda

* Refactor use of check_data_type in test classes

* Use TEST_SUITE_NAME with TYPED_TEST_SUITE

* Remove an unused namespace

* Make dim3 const

* Add BF8 x BF8 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add F8 x BF8 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add BF16 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add BF16 x BF16 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add BF8 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add F8 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add F16 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Skip failing tests of F16 x I4 for CompV3 with K == 2 * K_Tile

* Add missing precision type combinations to CompV4 from CompV3

* Move the INT8 tests around for consistency with KernelTypesCompV3Wmma

* Add missing precision type combinations to CompV3Wmma from CompV3

* Remove the basic and universal tests and their dependencies

* On __gfx950__, avoid using transposed loading of A with datatype pk_int4_t of B

* Use ADataType and BDataType instead of ComputeDataType for WarpGemm

* Explicitly set some return types to void

* Use more general typenames in InterleavedPKTypeLoader

* Add load_interleaved_pk_type.hpp to common.hpp

* Use std::is_same_v in load_int4_tile

* Add handling of LoadTranspose to load_int4_tile

* Factor out common code in several places using load_int4_tile

* Add support for pk_int4_t using load_int4_tile

* Fix formatting

[ROCm/composable_kernel commit: f2cfc6b94ee3154697030c4dfa214040bb4af4c9]
---
 include/ck_tile/core/tensor/tile_window.hpp   |  10 +-
 .../ops/common/load_interleaved_pk_type.hpp   |  29 +-
 .../block/block_universal_gemm_as_bs_cr.hpp   |  94 +---
 .../pipeline/gemm_pipeline_ag_bg_cr_base.hpp  |  17 +-
 ...emm_universal_pipeline_ag_bg_cr_policy.hpp |  38 +-
 .../block_universal_gemm_as_aquant_bs_cr.hpp  |  24 +-
 .../block_universal_gemm_as_bs_bquant_cr.hpp  |  24 +-
 .../gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp   |   1 -
 test/ck_tile/gemm/CMakeLists.txt              |  54 +--
 .../gemm/test_gemm_pipeline_basic_bf16.cpp    |  13 -
 .../gemm/test_gemm_pipeline_basic_bf8.cpp     |  13 -
 .../gemm/test_gemm_pipeline_basic_cases.hpp   |  25 -
 .../gemm/test_gemm_pipeline_basic_fp16.cpp    |  13 -
 .../gemm/test_gemm_pipeline_basic_fp8.cpp     |  14 -
 .../test_gemm_pipeline_basic_run_test.inc     | 218 ---------
 .../gemm/test_gemm_pipeline_comp_async.cpp    |   6 +-
 .../gemm/test_gemm_pipeline_compv3.cpp        |  19 +-
 .../gemm/test_gemm_pipeline_compv3_wmma.cpp   |  17 +-
 .../gemm/test_gemm_pipeline_compv4.cpp        |  12 +-
 .../gemm/test_gemm_pipeline_compv4_wmma.cpp   |   2 +-
 .../gemm/test_gemm_pipeline_compv6.cpp        |   7 +-
 .../gemm/test_gemm_pipeline_kernel_types.hpp  |  96 +++-
 test/ck_tile/gemm/test_gemm_pipeline_mem.cpp  |   5 +
 .../gemm/test_gemm_pipeline_persistent.cpp    |   2 +
 .../test_gemm_pipeline_smoke_run_test.inc     | 392 ---------------
 .../gemm/test_gemm_pipeline_smoke_util.hpp    | 450 ------------------
 .../test_gemm_pipeline_type_param_product.hpp |  63 ---
 .../test_gemm_pipeline_universal_bf16.cpp     |  16 -
 .../gemm/test_gemm_pipeline_universal_bf8.cpp |  16 -
 .../test_gemm_pipeline_universal_cases.hpp    |  25 -
 .../test_gemm_pipeline_universal_fp16.cpp     |  16 -
 .../gemm/test_gemm_pipeline_universal_fp8.cpp |  17 -
 .../test_gemm_pipeline_universal_int8.cpp     |  16 -
 .../test_gemm_pipeline_universal_pk_int4.cpp  |  16 -
 .../test_gemm_pipeline_universal_run_test.inc | 260 ----------
 .../gemm/test_gemm_pipeline_ut_cases.inc      |  35 +-
 test/ck_tile/gemm/test_gemm_pipeline_util.hpp | 128 ++---
 .../gemm/test_gemm_pipeline_wmma_base.hpp     |  37 +-
 38 files changed, 352 insertions(+), 1888 deletions(-)
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_basic_bf16.cpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_basic_bf8.cpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_basic_cases.hpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_basic_fp16.cpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_basic_fp8.cpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_smoke_run_test.inc
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_smoke_util.hpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_type_param_product.hpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_universal_bf16.cpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_universal_bf8.cpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_universal_cases.hpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_universal_fp16.cpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_universal_fp8.cpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_universal_int8.cpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_universal_pk_int4.cpp
 delete mode 100644 test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc

diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index 1123ce7604..ea459417d2 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -204,7 +204,7 @@ struct tile_window_with_static_distribution
               typename ElementWise_,
               index_t i_access_unsupport_ = -1,
               bool oob_conditional_check  = true>
-    CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor,
+    CK_TILE_DEVICE void load(DistributedTensor& dst_tensor,
                              const TileWindow_& tile_window,
                              ElementWise_ elementwise,
                              number<i_access_unsupport_>          = {},
@@ -283,7 +283,7 @@ struct tile_window_with_static_distribution
     template <typename DistributedTensor,
               index_t i_access_unsupport_ = -1,
               bool oob_conditional_check  = true>
-    CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor,
+    CK_TILE_DEVICE void load(DistributedTensor& dst_tensor,
                              number<i_access_unsupport_>          = {},
                              bool_constant<oob_conditional_check> = {}) const
     {
@@ -431,7 +431,7 @@ struct tile_window_with_static_distribution
               index_t i_access_unsupport_ = -1,
               bool oob_conditional_check  = true,
               bool pre_nop                = false>
-    CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile,
+    CK_TILE_DEVICE void async_load_raw(LdsTileWindow_&& lds_tile,
                                        number<i_access_unsupport_>          = {},
                                        bool_constant<oob_conditional_check> = {},
                                        bool_constant<pre_nop>               = {}) const
@@ -515,7 +515,7 @@ struct tile_window_with_static_distribution
               index_t i_access_unsupport_ = -1,
               bool oob_conditional_check  = true,
               typename = std::enable_if_t<std::is_class_v<remove_cvref_t<LdsTileWindow_>>>>
-    CK_TILE_DEVICE auto async_load_with_offset(index_t offset,
+    CK_TILE_DEVICE void async_load_with_offset(index_t offset,
                                                LdsTileWindow_&& lds_tile,
                                                number<i_access_unsupport_>          = {},
                                                bool_constant<oob_conditional_check> = {}) const
@@ -605,7 +605,7 @@ struct tile_window_with_static_distribution
               typename DistributedTensor,
               index_t i_access_unsupport_ = -1,
               bool oob_conditional_check  = true>
-    CK_TILE_DEVICE auto load_transpose_with_offset(index_t offset,
+    CK_TILE_DEVICE void load_transpose_with_offset(index_t offset,
                                                    DistributedTensor& dst_tensor,
                                                    number<i_access_unsupport_>          = {},
                                                    bool_constant<oob_conditional_check> = {}) const
diff --git a/include/ck_tile/ops/common/load_interleaved_pk_type.hpp b/include/ck_tile/ops/common/load_interleaved_pk_type.hpp
index fb7a05044f..91fa61763a 100644
--- a/include/ck_tile/ops/common/load_interleaved_pk_type.hpp
+++ b/include/ck_tile/ops/common/load_interleaved_pk_type.hpp
@@ -8,16 +8,7 @@
 
 namespace ck_tile {
 
-template <class T>
-struct is_pk_int4 : std::false_type
-{
-};
-template <>
-struct is_pk_int4<pk_int4_t> : std::true_type
-{
-};
-
-template <typename ComputeDataType, index_t UnaryOpSize>
+template <typename DstDataType, index_t UnaryOpSize>
 struct InterleavedPKTypeLoader
 {
     template <typename WarpWindow, typename WarpTile>
@@ -30,24 +21,30 @@ struct InterleavedPKTypeLoader
         constexpr index_t thread_buffer_size = WarpTile::get_thread_buffer_size() / UnaryOpSize;
         const auto in_dstr_tensors           = load_tile(warp_window);
 
-        using ComputeVectorType = ComputeDataType __attribute__((ext_vector_type(UnaryOpSize)));
+        using DstVectorType = DstDataType __attribute__((ext_vector_type(UnaryOpSize)));
         static_for<0, thread_buffer_size, 1>{}([&](auto i) {
-            elementwise_op(warp_tile.get_thread_buffer().template get_as<ComputeVectorType>()(i),
+            elementwise_op(warp_tile.get_thread_buffer().template get_as<DstVectorType>()(i),
                            in_dstr_tensors.get_thread_buffer().template get_as<pk_int4x4_t>()[i]);
         });
     }
 };
 
-template <typename BDataType,
-          typename ComputeDataType,
+template <typename SrcDataType,
+          typename DstDataType,
           index_t UnaryOpSize,
+          bool LoadTranspose = false,
           typename WarpTile,
           typename WarpWindow>
 CK_TILE_DEVICE void load_int4_tile(WarpTile& dst, const WarpWindow& src)
 {
-    if constexpr(is_pk_int4<std::remove_cv_t<BDataType>>::value)
+    if constexpr(std::is_same_v<SrcDataType, pk_int4_t>)
     {
-        InterleavedPKTypeLoader<ComputeDataType, UnaryOpSize>::load_interleaved_pk_type(dst, src);
+        static_assert(!LoadTranspose, "LoadTranspose not supported with pk_int4_t");
+        InterleavedPKTypeLoader<DstDataType, UnaryOpSize>::load_interleaved_pk_type(dst, src);
+    }
+    else if constexpr(LoadTranspose)
+    {
+        dst = load_tile_transpose(src);
     }
     else
     {
diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
index feea1ffa96..75a424e31e 100644
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -94,7 +94,11 @@ struct BlockUniversalGemmAsBsCr
     using ComputeDataType = remove_cvref_t<typename Traits::ComputeDataType>;
     using CDataType       = remove_cvref_t<typename Traits::CDataType>;
 
-    using Loader   = remove_cvref_t<InterleavedPKTypeLoader<ComputeDataType, UnaryOpSize_>>;
+    using ATypeToUse =
+        std::conditional_t<std::is_same_v<ADataType, pk_int4_t>, BDataType, ADataType>;
+    using BTypeToUse =
+        std::conditional_t<std::is_same_v<BDataType, pk_int4_t>, ADataType, BDataType>;
+
     using WarpGemm = remove_cvref_t<typename Traits::WarpGemm>;
 
     static constexpr index_t KIterPerWarp = Traits::KIterPerWarp;
@@ -196,8 +200,8 @@ struct BlockUniversalGemmAsBsCr
         static constexpr auto BLdsTileDistr =
             decltype(make_static_tile_distribution(MakeBBlockDistributionEncode())){};
 
-        using ALdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(ALdsTileDistr));
-        using BLdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(BLdsTileDistr));
+        using ALdsTile = decltype(make_static_distributed_tensor<ATypeToUse>(ALdsTileDistr));
+        using BLdsTile = decltype(make_static_distributed_tensor<BTypeToUse>(BLdsTileDistr));
 
         ALdsTile a_warp_tile_;
         BLdsTile b_warp_tile_;
@@ -222,22 +226,10 @@ struct BlockUniversalGemmAsBsCr
                           "The ADataType and BDataType as defined in "
                           "traits should be the same as correspoinding block window data type!");
 
-            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
-            {
-                Loader::load_interleaved_pk_type(a_warp_tile_, a_block_window);
-            }
-            else
-            {
-                load_tile(a_warp_tile_, a_block_window);
-            }
-            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
-            {
-                Loader::load_interleaved_pk_type(b_warp_tile_, b_block_window);
-            }
-            else
-            {
-                load_tile(b_warp_tile_, b_block_window);
-            }
+            load_int4_tile<ADataType, ATypeToUse, UnaryOpSize_, ALoadTranspose>(a_warp_tile_,
+                                                                                a_block_window);
+            load_int4_tile<BDataType, BTypeToUse, UnaryOpSize_, BLoadTranspose>(b_warp_tile_,
+                                                                                b_block_window);
             // hot loop:
             static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
                 static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
@@ -285,8 +277,8 @@ struct BlockUniversalGemmAsBsCr
         static constexpr auto BLdsTileDistr =
             decltype(make_static_tile_distribution(MakeBBlockDistributionEncode())){};
 
-        using ALdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(ALdsTileDistr));
-        using BLdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(BLdsTileDistr));
+        using ALdsTile = decltype(make_static_distributed_tensor<ATypeToUse>(ALdsTileDistr));
+        using BLdsTile = decltype(make_static_distributed_tensor<BTypeToUse>(BLdsTileDistr));
 
         ALdsTile a_warp_tile_;
         BLdsTile b_warp_tile_;
@@ -300,30 +292,10 @@ struct BlockUniversalGemmAsBsCr
                                           bool_constant<ALoadTranspose> = {},
                                           bool_constant<BLoadTranspose> = {})
         {
-            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
-            {
-                Loader::load_interleaved_pk_type(a_warp_tile_, a_block_window);
-            }
-            else if constexpr(ALoadTranspose)
-            {
-                a_warp_tile_ = load_tile_transpose(a_block_window);
-            }
-            else
-            {
-                load_tile(a_warp_tile_, a_block_window);
-            }
-            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
-            {
-                Loader::load_interleaved_pk_type(b_warp_tile_, b_block_window);
-            }
-            else if constexpr(BLoadTranspose)
-            {
-                b_warp_tile_ = load_tile_transpose(b_block_window);
-            }
-            else
-            {
-                load_tile(b_warp_tile_, b_block_window);
-            }
+            load_int4_tile<ADataType, ATypeToUse, UnaryOpSize_, ALoadTranspose>(a_warp_tile_,
+                                                                                a_block_window);
+            load_int4_tile<BDataType, BTypeToUse, UnaryOpSize_, BLoadTranspose>(b_warp_tile_,
+                                                                                b_block_window);
         }
 
         // C += A * B
@@ -396,8 +368,8 @@ struct BlockUniversalGemmAsBsCr
         static constexpr auto BLdsTileDistr =
             make_static_tile_distribution(MakeBBlockDistributionEncode());
 
-        using ALdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(ALdsTileDistr));
-        using BLdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(BLdsTileDistr));
+        using ALdsTile = decltype(make_static_distributed_tensor<ATypeToUse>(ALdsTileDistr));
+        using BLdsTile = decltype(make_static_distributed_tensor<BTypeToUse>(BLdsTileDistr));
 
         ALdsTile a_warp_tile_;
         BLdsTile b_warp_tile_;
@@ -451,30 +423,10 @@ struct BlockUniversalGemmAsBsCr
             auto b_lds_gemm_window = make_tile_window(
                 b_block_window.get_bottom_tensor_view(), b_lds_shape, b_offset, b_lds_load_distr);
 
-            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
-            {
-                Loader::load_interleaved_pk_type(a_warp_tile_, a_block_window);
-            }
-            else if constexpr(ALoadTranspose)
-            {
-                a_warp_tile_ = load_tile_transpose(a_lds_gemm_window);
-            }
-            else
-            {
-                load_tile(a_warp_tile_, a_lds_gemm_window);
-            }
-            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
-            {
-                Loader::load_interleaved_pk_type(b_warp_tile_, b_block_window);
-            }
-            else if constexpr(BLoadTranspose)
-            {
-                b_warp_tile_ = load_tile_transpose(b_lds_gemm_window);
-            }
-            else
-            {
-                load_tile(b_warp_tile_, b_lds_gemm_window);
-            }
+            load_int4_tile<ADataType, ATypeToUse, UnaryOpSize_, ALoadTranspose>(a_warp_tile_,
+                                                                                a_lds_gemm_window);
+            load_int4_tile<BDataType, BTypeToUse, UnaryOpSize_, BLoadTranspose>(b_warp_tile_,
+                                                                                b_lds_gemm_window);
         }
 
         // C += A * B
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
index b5584f98df..a05e07bbc4 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
@@ -26,8 +26,21 @@ struct GemmPipelineAgBgCrImplBase
     static constexpr index_t NPerBlock = BlockGemmShape::kN;
     static constexpr index_t KPerBlock = BlockGemmShape::kK;
 #if defined(__gfx950__)
-    static constexpr bool is_a_load_tr = std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>;
-    static constexpr bool is_b_load_tr = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;
+    // The combination of pk_int4_t and transposed loading causes numerical errors.
+    // Therefore do not use transposed loading in this case.
+    static constexpr bool is_a_load_tr = []() {
+        if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            return false;
+        else
+            return std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>;
+    }();
+
+    static constexpr bool is_b_load_tr = []() {
+        if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            return false;
+        else
+            return std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;
+    }();
 #else
     static constexpr bool is_a_load_tr = false;
     static constexpr bool is_b_load_tr = false;
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
index ecff6fe497..404c88fbf8 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -33,12 +33,27 @@ template <typename Derived>
 struct UniversalGemmBasePolicy
 {
 #if defined(__gfx950__)
+    // The combination of pk_int4_t and transposed loading causes numerical errors.
+    // Therefore do not use transposed loading in this case.
     template <typename Problem>
-    static constexpr bool is_a_load_tr =
-        std::is_same_v<remove_cvref_t<typename Problem::ALayout>, tensor_layout::gemm::ColumnMajor>;
+    static constexpr bool is_a_load_tr = []() {
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            return false;
+        else
+            return std::is_same_v<remove_cvref_t<typename Problem::ALayout>,
+                                  tensor_layout::gemm::ColumnMajor>;
+    }();
+
     template <typename Problem>
-    static constexpr bool is_b_load_tr =
-        std::is_same_v<remove_cvref_t<typename Problem::BLayout>, tensor_layout::gemm::RowMajor>;
+    static constexpr bool is_b_load_tr = []() {
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            return false;
+        else
+            return std::is_same_v<remove_cvref_t<typename Problem::BLayout>,
+                                  tensor_layout::gemm::RowMajor>;
+    }();
 #else
     template <typename Problem>
     static constexpr bool is_a_load_tr = false;
@@ -707,8 +722,15 @@ struct UniversalGemmPipelineAgBgCrPolicy
             : vector_size * 4 == thread_elements              ? WGAttrNumAccessEnum::Quad
                                                               : WGAttrNumAccessEnum::Invalid;
 
-        using WarpGemm = WarpGemmDispatcher<typename Problem::ComputeDataType,
-                                            typename Problem::ComputeDataType,
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        using ATypeToUse =
+            std::conditional_t<std::is_same_v<ADataType, pk_int4_t>, BDataType, ADataType>;
+        using BTypeToUse =
+            std::conditional_t<std::is_same_v<BDataType, pk_int4_t>, ADataType, BDataType>;
+
+        using WarpGemm = WarpGemmDispatcher<ATypeToUse,
+                                            BTypeToUse,
                                             typename Problem::CDataType,
                                             WarpTile::at(I0),
                                             WarpTile::at(I1),
@@ -718,8 +740,8 @@ struct UniversalGemmPipelineAgBgCrPolicy
                                             Problem::UseStructuredSparsity,
                                             wg_attr_num_access>;
 
-        using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
-                                                                      typename Problem::BDataType,
+        using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<ATypeToUse,
+                                                                      BTypeToUse,
                                                                       typename Problem::CDataType,
                                                                       BlockWarps,
                                                                       WarpGemm>;
diff --git a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
index bbdd3128bf..608de80a7a 100644
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
@@ -5,7 +5,6 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/core/arch/arch.hpp"
-#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/elementwise.hpp"
@@ -156,7 +155,6 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
 
     using Base = BlockGemmAQuantBase<Problem_>;
 
-    using Loader   = remove_cvref_t<InterleavedPKTypeLoader<ComputeDataType, UnaryOpSize_>>;
     using WarpGemm = remove_cvref_t<typename Traits::WarpGemm>;
 
     static constexpr index_t KIterPerWarp = Traits::KIterPerWarp;
@@ -447,26 +445,8 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
         CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window,
                                           const BSmemBlockWindow& b_block_window)
         {
-            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
-            {
-                static_assert(std::is_same_v<ComputeDataType, fp8_t> ||
-                              std::is_same_v<ComputeDataType, bf8_t>);
-                Loader::load_interleaved_pk_type(a_warp_tile_, a_block_window);
-            }
-            else
-            {
-                load_tile(a_warp_tile_, a_block_window);
-            }
-            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
-            {
-                static_assert(std::is_same_v<ComputeDataType, fp8_t> ||
-                              std::is_same_v<ComputeDataType, bf8_t>);
-                Loader::load_interleaved_pk_type(b_warp_tile_, b_block_window);
-            }
-            else
-            {
-                load_tile(b_warp_tile_, b_block_window);
-            }
+            load_int4_tile<ADataType, ComputeDataType, UnaryOpSize_>(a_warp_tile_, a_block_window);
+            load_int4_tile<BDataType, ComputeDataType, UnaryOpSize_>(b_warp_tile_, b_block_window);
         }
 
         // C += A * B
diff --git a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
index 28ae709bf0..41ed272d0d 100644
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
@@ -5,7 +5,6 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/core/arch/arch.hpp"
-#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/elementwise.hpp"
@@ -155,7 +154,6 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
 
     using Base = BlockGemmBQuantBase<Problem_>;
 
-    using Loader   = remove_cvref_t<InterleavedPKTypeLoader<ComputeDataType, UnaryOpSize_>>;
     using WarpGemm = remove_cvref_t<typename Traits::WarpGemm>;
 
     static constexpr index_t KIterPerWarp = Traits::KIterPerWarp;
@@ -273,26 +271,8 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
         CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window,
                                           const BSmemBlockWindow& b_block_window)
         {
-            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
-            {
-                static_assert(std::is_same_v<ComputeDataType, fp8_t> ||
-                              std::is_same_v<ComputeDataType, bf8_t>);
-                Loader::load_interleaved_pk_type(a_warp_tile_, a_block_window);
-            }
-            else
-            {
-                load_tile(a_warp_tile_, a_block_window);
-            }
-            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
-            {
-                static_assert(std::is_same_v<ComputeDataType, fp8_t> ||
-                              std::is_same_v<ComputeDataType, bf8_t>);
-                Loader::load_interleaved_pk_type(b_warp_tile_, b_block_window);
-            }
-            else
-            {
-                load_tile(b_warp_tile_, b_block_window);
-            }
+            load_int4_tile<ADataType, ComputeDataType, UnaryOpSize_>(a_warp_tile_, a_block_window);
+            load_int4_tile<BDataType, ComputeDataType, UnaryOpSize_>(b_warp_tile_, b_block_window);
         }
 
         // C += A * B
diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
index 2b469fa7c6..825c86b0a1 100644
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
@@ -7,7 +7,6 @@
 #include <sstream>
 
 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_base.hpp"
diff --git a/test/ck_tile/gemm/CMakeLists.txt b/test/ck_tile/gemm/CMakeLists.txt
index c08ab33b91..8365b9ff45 100644
--- a/test/ck_tile/gemm/CMakeLists.txt
+++ b/test/ck_tile/gemm/CMakeLists.txt
@@ -13,49 +13,6 @@ list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS
 )
 set(EXAMPLE_GEMM_COMPILE_COMPUTE_ASYNC_OPTIONS ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})
 
-if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx11|gfx12")
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_int8 test_gemm_pipeline_universal_int8.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_int8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_pk_int4 test_gemm_pipeline_universal_pk_int4.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_pk_int4 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-else()
-    message(DEBUG "Skipping ck_tile_gemm tests for current target")
-endif()
-
-
-if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
-    add_gtest_executable(test_gemm_pipeline_compiler test_gemm_pipeline_compiler.cpp)
-    target_compile_options(test_gemm_pipeline_compiler PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-endif()
-
-if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_fp8 test_gemm_pipeline_universal_fp8.cpp)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_bf8 test_gemm_pipeline_universal_bf8.cpp)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_basic_fp8 test_gemm_pipeline_basic_fp8.cpp)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_basic_bf8 test_gemm_pipeline_basic_bf8.cpp)
-
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_fp8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_bf8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_basic_fp8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_basic_bf8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-else()
-    message(DEBUG "Skipping ck_tile_gemm tests for current target")
-endif()
-
-if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_fp16 test_gemm_pipeline_universal_fp16.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_fp16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_fp16 PRIVATE --save-temps -Wno-gnu-line-marker)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_bf16 test_gemm_pipeline_universal_bf16.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_bf16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    add_gtest_executable(test_ck_tile_gemm_pipeline_basic_fp16 test_gemm_pipeline_basic_fp16.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_basic_fp16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    add_gtest_executable(test_ck_tile_gemm_pipeline_basic_bf16 test_gemm_pipeline_basic_bf16.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_basic_bf16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-else()
-    message(DEBUG "Skipping ck_tile_gemm tests for current target ")
-endif()
-
 if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
     if(GPU_TARGETS MATCHES "gfx94|gfx95")
         add_gtest_executable(test_ck_tile_gemm_pipeline_mem test_gemm_pipeline_mem.cpp)
@@ -77,7 +34,16 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
     endif()
 
     if(GPU_TARGETS MATCHES "gfx11|gfx12")
-    # On Radeon devices, build the WMMA version instead
+        # On Radeon devices, build the WMMA version instead
+        # Define architecture macros for compile-time detection
+        if(GPU_TARGETS MATCHES "gfx12")
+            list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DARCH_GFX12)
+            list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS -DARCH_GFX12)
+        elseif(GPU_TARGETS MATCHES "gfx11")
+            list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DARCH_GFX11)
+            list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS -DARCH_GFX11)
+        endif()
+
         add_gtest_executable(test_ck_tile_gemm_pipeline_mem_wmma test_gemm_pipeline_mem_wmma.cpp)
         add_gtest_executable(test_ck_tile_gemm_pipeline_compv3_wmma test_gemm_pipeline_compv3_wmma.cpp)
         add_gtest_executable(test_ck_tile_gemm_pipeline_compv4_wmma test_gemm_pipeline_compv4_wmma.cpp)
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_bf16.cpp b/test/ck_tile/gemm/test_gemm_pipeline_basic_bf16.cpp
deleted file mode 100644
index eef8f0cb5e..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_bf16.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_basic_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using PrecTypes      = ::testing::Types<std::tuple<BF16, BF16, BF16>, std::tuple<BF16, I4, BF16>>;
-using BasicTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_basic_cases.hpp"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_bf8.cpp b/test/ck_tile/gemm/test_gemm_pipeline_basic_bf8.cpp
deleted file mode 100644
index aec8af7b3a..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_bf8.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_basic_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using PrecTypes      = ::testing::Types<std::tuple<BF8, BF8, F16>, std::tuple<BF8, I4, F16>>;
-using BasicTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_basic_cases.hpp"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_cases.hpp b/test/ck_tile/gemm/test_gemm_pipeline_basic_cases.hpp
deleted file mode 100644
index c0b041f3e6..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_cases.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include "gtest/gtest.h"
-
-TYPED_TEST_SUITE(TestCkTileGemmPipelineBasic, BasicTestTypes);
-
-TYPED_TEST(TestCkTileGemmPipelineBasic, GemmTest)
-{
-    // Define possible values for each parameter
-    std::vector<int> m_values = {128, 1024};
-    std::vector<int> n_values = {128, 2048};
-    std::vector<int> k_values = {64, 128};
-
-    for(const auto& m : m_values)
-    {
-        for(const auto& n : n_values)
-        {
-            for(const auto& k : k_values)
-            {
-                this->run_gemm_combinations(m, n, k);
-            }
-        }
-    }
-}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_fp16.cpp b/test/ck_tile/gemm/test_gemm_pipeline_basic_fp16.cpp
deleted file mode 100644
index 6de47d1c59..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_fp16.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_basic_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using PrecTypes      = ::testing::Types<std::tuple<F16, F16, F16>, std::tuple<F16, I4, F16>>;
-using BasicTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_basic_cases.hpp"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_fp8.cpp b/test/ck_tile/gemm/test_gemm_pipeline_basic_fp8.cpp
deleted file mode 100644
index 722ffbd16f..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_fp8.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_basic_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using PrecTypes =
-    ::testing::Types<std::tuple<F8, F8, F16>, std::tuple<F8, BF8, F16>, std::tuple<F8, I4, F16>>;
-using BasicTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_basic_cases.hpp"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc b/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
deleted file mode 100644
index 3e019b7097..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
+++ /dev/null
@@ -1,218 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <hip/hip_runtime.h>
-
-#include <cstring>
-#include <iostream>
-#include <ostream>
-#include <string>
-#include <tuple>
-
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-
-struct GemmConfig_Mfma : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 64;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = 16;
-};
-
-struct GemmConfig_Wmma : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 64;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 16;
-    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = 16;
-};
-
-#if CK_TILE_USE_WMMA
-using GemmConfigs = ::testing::Types<GemmConfig_Wmma>;
-#else
-using GemmConfigs = ::testing::Types<GemmConfig_Mfma>;
-#endif
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          bool Persistent,
-          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
-
-{
-    if constexpr(Persistent)
-        std::cout << "WARNING: Ignoring persistent kernel option for basic gemm." << std::endl;
-    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    constexpr int kBlockPerCu = 1;
-
-    // This part comes from the Codegen
-    constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
-    constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
-    constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
-    constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
-    constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
-
-    using CodegenGemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
-
-    using CodegenGemmTraits =
-        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-
-    using CodegenPipelineProblem = ck_tile::
-        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
-
-    using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
-
-    const auto Run = [&](const auto memory_operation_) {
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                             BDataType,
-                                             ck_tile::tuple<>,
-                                             AccDataType,
-                                             CDataType,
-                                             ck_tile::tuple<>,
-                                             CLayout,
-                                             ck_tile::element_wise::PassThrough,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             CodegenPipelineProblem::TransposeC,
-                                             memory_operation>>;
-
-        // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
-        // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
-        using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw ArgumentsNotSupportedException(
-                "Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenGemmShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-
-    if(args.k_batch == 1)
-    {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::set>{});
-    }
-    else
-    {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::atomic_add>{});
-    }
-}
-
-template <typename GemmConfig,
-          typename APrecType,
-          typename BPrecType = APrecType,
-          typename CPrecType = APrecType>
-bool run_gemm_test_prec_type(const int M, const int N, const int K)
-{
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType, Row, Col, Row>(
-        M, N, K);
-}
-
-template <typename Tuple>
-class TestCkTileGemmPipelineBasic : public ::testing::Test
-{
-    protected:
-    using GemmConfig = std::tuple_element_t<0, Tuple>;
-    using APrecType  = std::tuple_element_t<1, Tuple>;
-    using BPrecType  = std::tuple_element_t<2, Tuple>;
-    using CPrecType  = std::tuple_element_t<3, Tuple>;
-
-    void run_gemm_combinations(const int m, const int n, const int k)
-    {
-        // Skip tests that are known to fail
-        if constexpr(std::is_same_v<APrecType, F8> && std::is_same_v<BPrecType, BF8>)
-        {
-            GTEST_SKIP() << "Skipping this test due to known failures with F8 x BF8";
-        }
-        else if constexpr(std::is_same_v<APrecType, F16> && std::is_same_v<BPrecType, I4>)
-        {
-            GTEST_SKIP() << "Skipping this test due to known failures with F16 x I4";
-        }
-        else
-        {
-            bool is_success = true;
-            std::cout << "-m=" << m << " -n=" << n << " -k=" << k << std::endl;
-
-            // Call the function with the current configuration
-            try
-            {
-                is_success =
-                    run_gemm_test_prec_type<GemmConfig, APrecType, BPrecType, CPrecType>(m, n, k);
-            }
-            catch(const ArgumentsNotSupportedException& e)
-            {
-                std::cerr << "Caught ArgumentsNotSupportedException: " << e.what() << '\n';
-                // ArgumentsNotSupportedException  is not an error. Do not change is_success
-            }
-            catch(const std::runtime_error& e)
-            {
-                std::cerr << "Caught runtime error: " << e.what() << '\n';
-                is_success = false;
-            }
-            EXPECT_TRUE(is_success);
-        }
-    }
-};
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_comp_async.cpp b/test/ck_tile/gemm/test_gemm_pipeline_comp_async.cpp
index d31b379f2a..403a587410 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_comp_async.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_comp_async.cpp
@@ -7,13 +7,15 @@
 
 template <typename T>
 class TestCkTileGemmPipelineCompAsync
-    : public TestCkTileGemmPipeline<T, class TestCkTileGemmPipelineCompAsync<T>>
+    : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompAsync<T>>
 {
+    public:
+    static constexpr bool check_data_type() { return true; }
 };
 
 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompAsync
 
-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompAsync, KernelTypesCompAsync);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompAsync);
 
 #include "test_gemm_pipeline_ut_cases.inc"
 
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp
index d04981ccb4..e370ed5d68 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp
@@ -9,11 +9,28 @@ template <typename T>
 class TestCkTileGemmPipelineCompV3
     : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV3<T>>
 {
+    public:
+    static constexpr bool check_data_type()
+    {
+        using Base = TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV3<T>>;
+        if constexpr(std::is_same_v<typename Base::ADataType, F8> &&
+                     std::is_same_v<typename Base::BDataType, BF8>)
+        {
+            return false;
+        }
+        else if constexpr(std::is_same_v<typename Base::BLayout, Row> &&
+                          std::is_same_v<typename Base::BDataType, I4>)
+        {
+            return false;
+        }
+
+        return true;
+    }
 };
 
 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV3
 
-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV3, KernelTypesCompV3);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompV3);
 
 #include "test_gemm_pipeline_ut_cases.inc"
 
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv3_wmma.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv3_wmma.cpp
index dc1fada04b..59db69eb4b 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_compv3_wmma.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv3_wmma.cpp
@@ -9,11 +9,26 @@ template <typename T>
 class TestCkTileGemmPipelineCompV3Wmma
     : public TestCkTileGemmPipelineWmmaBase<T, TestCkTileGemmPipelineCompV3Wmma<T>>
 {
+    public:
+    static constexpr bool check_data_type()
+    {
+        using Base1 = TestCkTileGemmPipelineWmmaBase<T, TestCkTileGemmPipelineCompV3Wmma<T>>;
+        using Base2 = TestCkTileGemmPipeline<T, Base1>;
+        if constexpr(std::is_same_v<typename Base2::BLayout, Row> &&
+                     std::is_same_v<typename Base2::BDataType, I4>)
+        {
+            return false;
+        }
+        else
+        {
+            return Base1::check_data_type();
+        }
+    }
 };
 
 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV3Wmma
 
-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV3Wmma, KernelTypesCompV3Wmma);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompV3Wmma);
 
 #include "test_gemm_pipeline_ut_cases.inc"
 
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp
index 480b0f6e7b..14e767b820 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp
@@ -9,11 +9,21 @@ template <typename T>
 class TestCkTileGemmPipelineCompV4
     : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV4<T>>
 {
+    public:
+    static constexpr bool check_data_type()
+    {
+        using Base = TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV4<T>>;
+        if constexpr(std::is_same_v<typename Base::BDataType, I4>)
+        {
+            return false;
+        }
+        return true;
+    }
 };
 
 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV4
 
-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV4, KernelTypesCompV4);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompV4);
 
 #include "test_gemm_pipeline_ut_cases.inc"
 
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv4_wmma.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv4_wmma.cpp
index b868dfdf1c..88ab4fdf93 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_compv4_wmma.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv4_wmma.cpp
@@ -13,7 +13,7 @@ class TestCkTileGemmPipelineCompV4Wmma
 
 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV4Wmma
 
-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV4Wmma, KernelTypesCompV4Wmma);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompV4Wmma);
 
 #include "test_gemm_pipeline_ut_cases.inc"
 
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv6.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv6.cpp
index a72ff98055..b430ca2a12 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_compv6.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv6.cpp
@@ -1,3 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include "test_gemm_pipeline_kernel_types.hpp"
 #include "test_gemm_pipeline_util.hpp"
 #include "gtest/gtest.h"
@@ -6,11 +9,13 @@ template <typename T>
 class TestCkTileGemmPipelineCompV6
     : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV6<T>>
 {
+    public:
+    static constexpr bool check_data_type() { return true; }
 };
 
 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV6
 
-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV6, KernelTypesCompV6);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompV6);
 
 #include "test_gemm_pipeline_ut_cases.inc"
 
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp b/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
index 6664fc2100..8ae7252908 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
@@ -79,55 +79,131 @@ using KernelTypesMemWmma = ::testing::Types<
 
 using KernelTypesCompV3 = ::testing::Types<
     std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
     std::tuple<    Col,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>
+    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>
 >;
 
 using KernelTypesCompV3Wmma = ::testing::Types<
     std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F16,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF16,      I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF8,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F16,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF16,      I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Row,     Col,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF8,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F16,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Col,     Row,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF16,      I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Col,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,        BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,        I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Col,     Row,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF8,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F16,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Col,     Col,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF16,      I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
     std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>
+    std::tuple<    Col,     Col,     Row,       F8,        BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F8,        I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF8,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>
 >;
 
 using KernelTypesCompV4 = ::testing::Types<
     std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
     std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
     std::tuple<    Row,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
     std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
     std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
     std::tuple<    Row,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
     std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
     std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
     std::tuple<    Col,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
     std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
     std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
-    std::tuple<    Col,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV4>
+    std::tuple<    Col,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>
 >;
 
 // clang-format on
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp b/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp
index 51fbebc915..b97c140e1a 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp
@@ -1,3 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include "test_gemm_pipeline_kernel_types.hpp"
 #include "test_gemm_pipeline_util.hpp"
 #include "gtest/gtest.h"
@@ -5,6 +8,8 @@
 template <typename T>
 class TestCkTileGemmPipelineMem : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineMem<T>>
 {
+    public:
+    static constexpr bool check_data_type() { return true; }
 };
 
 #define TEST_SUITE_NAME TestCkTileGemmPipelineMem
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
index b3d433c466..c23d148583 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
@@ -9,6 +9,8 @@ template <typename T>
 class TestCkTileGemmPipelinePersistent
     : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelinePersistent<T>>
 {
+    public:
+    static constexpr bool check_data_type() { return true; }
 };
 
 #define TEST_SUITE_NAME TestCkTileGemmPipelinePersistent
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_smoke_run_test.inc b/test/ck_tile/gemm/test_gemm_pipeline_smoke_run_test.inc
deleted file mode 100644
index 20ee426d1c..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_smoke_run_test.inc
+++ /dev/null
@@ -1,392 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-
-#include "ck_tile/host/permute_pk_int4.hpp"
-
-template <typename Layout>
-static constexpr inline auto is_row_major(Layout layout_)
-{
-    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
-                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
-}
-
-template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
-auto calculate_rtol_atol(const ck_tile::index_t K,
-                         const ck_tile::index_t kbatch,
-                         const float max_accumulated_value)
-{
-    using ComputeType =
-        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
-    // Calculate thresholds
-    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
-        ck_tile::integer_divide_ceil(K, kbatch));
-    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
-        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
-    // Calculate error due to split_k accumulation
-    const auto rtol_split_k =
-        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
-    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
-        max_accumulated_value, kbatch);
-    // Use higher threshold
-    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
-}
-
-template <typename GemmConfig,
-          typename Tensor,
-          typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
-void permute_tensor_b(Tensor& tensor)
-{
-    using GemmShape = ck_tile::TileGemmShape<
-        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
-        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
-        ck_tile::
-            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
-        GemmConfig::PermuteA,
-        GemmConfig::PermuteB>;
-
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
-                                                                 GemmConfig::kPadN,
-                                                                 GemmConfig::kPadK,
-                                                                 GemmConfig::DoubleSmemBuffer,
-                                                                 ALayout,
-                                                                 BLayout,
-                                                                 CLayout,
-                                                                 GemmConfig::TransposeC,
-                                                                 GemmConfig::UseStructuredSparsity>;
-
-    using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                       BDataType,
-                                                                       AccDataType,
-                                                                       GemmShape,
-                                                                       GemmUniversalTraits,
-                                                                       GemmConfig::Scheduler,
-                                                                       true,
-                                                                       ck_tile::TailNumber::Full>;
-
-    using GemmPipeline = typename PipelineTypeTraits<GemmConfig::Pipeline>::template GemmPipeline<
-        UniversalGemmProblem>;
-
-    const ck_tile::index_t K  = tensor.get_length(0);
-    const ck_tile::index_t N  = tensor.get_length(1);
-    const ck_tile::index_t K1 = GemmPipeline::GetSmemPackB();
-    const ck_tile::index_t K0 = K / K1;
-
-    Tensor tensor_copy = tensor;
-
-    // int K0, N, K1
-    for(int j = 0; j < K0; j++)
-    {
-        for(int i = 0; i < N; i++)
-        {
-            for(int jj = 0; jj < K1; jj++)
-            {
-                tensor(j * N * K1 + i * K1 + jj) = tensor_copy(i * K + (j * K1 + jj));
-            }
-        }
-    }
-}
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          bool Persistent,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
-                  ck_tile::DeviceMem& b_k_n_dev_buf,
-                  ck_tile::DeviceMem& c_m_n_dev_buf,
-                  ck_tile::index_t M,
-                  ck_tile::index_t N,
-                  ck_tile::index_t K,
-                  ck_tile::index_t stride_A,
-                  ck_tile::index_t stride_B,
-                  ck_tile::index_t stride_C,
-                  ck_tile::index_t kbatch,
-                  int n_warmup,
-                  int n_repeat,
-                  bool persistent)
-{
-    ck_tile::GemmHostArgs args = {a_m_k_dev_buf.GetDeviceBuffer(),
-                                  b_k_n_dev_buf.GetDeviceBuffer(),
-                                  c_m_n_dev_buf.GetDeviceBuffer(),
-                                  kbatch,
-                                  M,
-                                  N,
-                                  K,
-                                  stride_A,
-                                  stride_B,
-                                  stride_C};
-
-    float ave_time;
-    if(persistent)
-    {
-        ave_time = gemm<GemmConfig,
-                        ADataType,
-                        BDataType,
-                        DsDataType,
-                        AccDataType,
-                        CDataType,
-                        ALayout,
-                        BLayout,
-                        DsLayout,
-                        CLayout,
-                        true,
-                        CDEElementWise>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
-    }
-    else
-    {
-        ave_time = gemm<GemmConfig,
-                        ADataType,
-                        BDataType,
-                        DsDataType,
-                        AccDataType,
-                        CDataType,
-                        ALayout,
-                        BLayout,
-                        DsLayout,
-                        CLayout,
-                        false,
-                        CDEElementWise>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
-    }
-
-    std::size_t flop = std::size_t(2) * M * N * K;
-    std::size_t num_byte =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K
-              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
-              << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
-              << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits<ADataType>::name
-              << " B_Type=" << DataTypeTraits<BDataType>::name
-              << " C_Type=" << DataTypeTraits<CDataType>::name
-              << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
-              << " Persistent=" << (persistent ? "on" : "off") << " : " << ave_time << " ms, "
-              << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
-
-    return ave_time;
-}
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType = ADataType,
-          typename CDataType = ADataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
-bool run_gemm_test_with_layouts(const int M, const int N, const int K)
-{
-    using AccDataType = typename GemmTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
-
-    ck_tile::index_t stride_A = 0;
-    ck_tile::index_t stride_B = 0;
-    ck_tile::index_t stride_C = 0;
-
-    constexpr ck_tile::index_t kbatch = 1;
-    constexpr int init_method         = 0;
-    constexpr int verification_method = 2;
-    constexpr int n_warmup            = 0;
-    constexpr int n_repeat            = 1;
-    constexpr bool persistent         = false;
-
-    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(ALayout{}));
-    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(BLayout{}));
-    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
-
-    ck_tile::HostTensor<ADataType> a_m_k(
-        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(ALayout{})));
-    ck_tile::HostTensor<BDataType> b_k_n(
-        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(BLayout{})));
-    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
-        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-
-    if constexpr(init_method == 0)
-    {
-        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
-        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
-    }
-    else if constexpr(init_method == 1)
-    {
-        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
-        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
-    }
-    else if constexpr(init_method == 2)
-    {
-        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k);
-        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n);
-    }
-    else
-    {
-        a_m_k.SetZero();
-        b_k_n.SetZero();
-    }
-
-    if(GemmConfig::UseStructuredSparsity)
-    {
-        ck_tile::AdjustToStructuredSparsity<ADataType>{}(a_m_k);
-    }
-
-    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
-
-    static_assert(!GemmConfig::PermuteA, "Not implemented");
-    if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
-    {
-        // Permute vector pk_i4x4 data for device implementation
-        ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
-        if constexpr(GemmConfig::PermuteB)
-        {
-            permute_tensor_b<GemmConfig,
-                             decltype(b_k_n_dev),
-                             ADataType,
-                             BDataType,
-                             AccDataType,
-                             CDataType,
-                             ALayout,
-                             BLayout,
-                             CLayout>(b_k_n_dev);
-        }
-        permute_vectors_i4x4_b(b_k_n_dev);
-        b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
-    }
-    else
-    {
-        if constexpr(GemmConfig::PermuteB)
-        {
-            std::cout << "Permute for this DataType is not implemented." << std::endl;
-            return false;
-        }
-        b_k_n_dev_buf.ToDevice(b_k_n.data());
-    }
-
-    a_m_k_dev_buf.ToDevice(a_m_k.data());
-    c_m_n_dev_buf.SetZero();
-    c_m_n_dev_result.SetZero();
-
-    invoke_gemm<GemmConfig,
-                ADataType,
-                BDataType,
-                ck_tile::tuple<>,
-                AccDataType,
-                CDataType,
-                ALayout,
-                BLayout,
-                ck_tile::tuple<>,
-                CLayout>(a_m_k_dev_buf,
-                         b_k_n_dev_buf,
-                         c_m_n_dev_buf,
-                         M,
-                         N,
-                         K,
-                         stride_A,
-                         stride_B,
-                         stride_C,
-                         kbatch,
-                         n_warmup,
-                         n_repeat,
-                         persistent);
-
-    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
-    bool pass = true;
-
-    if constexpr(verification_method == 1)
-    {
-        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-        c_m_n_host_ref.SetZero();
-
-        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
-            a_m_k, b_k_n, c_m_n_host_ref);
-        const float max_accumulated_value =
-            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_host_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-
-        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                  << std::endl;
-        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
-    }
-    else if constexpr(verification_method == 2)
-    {
-        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
-        {
-            // Restore input for B for gpu reference
-            b_k_n_dev_buf.ToDevice(b_k_n.data());
-        }
-
-        // memory on host to store gpu reference result
-        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-        // memory on device to store gpu reference result
-        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
-
-        c_m_n_gpu_ref.SetZero();
-        c_m_n_gpu_buf_ref.SetZero();
-
-        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
-        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
-        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());
-
-        ck_tile::reference_gemm_gpu<ADataType,
-                                    BDataType,
-                                    AccDataType,
-                                    CDataType,
-                                    ALayout,
-                                    BLayout,
-                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
-
-        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
-
-        const float max_accumulated_value =
-            *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_gpu_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                  << std::endl;
-        std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
-    }
-
-    return pass;
-}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_smoke_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_smoke_util.hpp
deleted file mode 100644
index 1f9033cab9..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_smoke_util.hpp
+++ /dev/null
@@ -1,450 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <string>
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/host/kernel_launch.hpp"
-#include "ck_tile/ops/epilogue.hpp"
-#include "ck_tile/ops/gemm.hpp"
-
-class ArgumentsNotSupportedException : public std::logic_error
-{
-    public:
-    explicit ArgumentsNotSupportedException(const std::string& message) : logic_error(message) {}
-};
-
-// temporary workaround to get k_warp_tile based on PrecType and gfx950 or not
-template <typename PrecType, ck_tile::index_t M_Warp_Tile>
-constexpr ck_tile::index_t get_k_warp_tile()
-{
-#if defined(CK_GFX950_SUPPORT)
-    constexpr bool is_8bit_float =
-        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
-    if constexpr(M_Warp_Tile == 32)
-        return is_8bit_float ? 64 : 16;
-    else
-        return is_8bit_float ? 128 : 32;
-#else
-    if constexpr(M_Warp_Tile == 32)
-        return 16;
-    else
-        return 32;
-#endif
-}
-
-struct GemmConfigBase
-{
-    static constexpr bool kPadM = false;
-    static constexpr bool kPadN = false;
-    static constexpr bool kPadK = false;
-
-    static constexpr bool PermuteA = false;
-    static constexpr bool PermuteB = false;
-
-    static constexpr bool TransposeC            = false;
-    static constexpr bool UseStructuredSparsity = false;
-
-    static constexpr int kBlockPerCu                         = 1;
-    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
-    static constexpr ck_tile::index_t TileParitionerM01      = 4;
-    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
-    static constexpr ck_tile::index_t NumWaveGroups = 1;
-};
-
-template <typename PrecType>
-struct GemmConfigMemoryInterwave : public GemmConfigBase
-{
-    // Memory friendly for Interwave scheduler
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 32;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 4;
-    static constexpr ck_tile::index_t N_Warp = 1;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16;
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::MEMORY;
-    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Interwave;
-};
-
-template <typename PrecType>
-struct GemmConfigMemoryIntrawave : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 32;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 4;
-    static constexpr ck_tile::index_t N_Warp = 1;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16;
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::MEMORY;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV3 : public GemmConfigBase
-{
-    // Compute V3 only support Intrawave scheduler
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV3_1 : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV3_2 : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 16;
-    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
-
-    static constexpr int kBlockPerCu = 2;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV4 : public GemmConfigBase
-{
-    // Compute V4 only support Intrawave scheduler
-    // Using the ping pong reader in the lds level
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer          = true;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V4;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV4_1 : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer          = true;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V4;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV5 : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 1;
-    static constexpr ck_tile::index_t N_Warp = 1;
-    static constexpr ck_tile::index_t K_Warp = 2;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer               = false;
-    static constexpr ck_tile::GemmPipeline Pipeline      = ck_tile::GemmPipeline::COMPUTE_V5;
-    static constexpr ck_tile::index_t NumWaNumWaveGroups = 2;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV3_WMMA : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 4;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 16;
-    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
-
-    static constexpr int kBlockPerCu = 2;
-};
-
-template <typename PrecType>
-#if CK_TILE_USE_WMMA
-using GemmConfigsTemplate = ::testing::Types<GemmConfigComputeV3_WMMA<PrecType>>;
-#else
-using GemmConfigsTemplate = ::testing::Types<GemmConfigComputeV3<PrecType>,
-                                             GemmConfigComputeV3_2<PrecType>,
-                                             GemmConfigComputeV4<PrecType>>;
-#endif
-
-template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
-struct GemmTypeConfig;
-
-template <>
-struct GemmTypeConfig<ck_tile::half_t>
-{
-    using ADataType   = ck_tile::half_t;
-    using BDataType   = ck_tile::half_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-    // ToDo: Add more bias config to support different categories of GEMM.
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t>
-{
-    using ADataType   = ck_tile::bf16_t;
-    using BDataType   = ck_tile::bf16_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::bf16_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::bf16_t, ck_tile::pk_int4_t, ck_tile::bf16_t>
-{
-    using ADataType   = ck_tile::bf16_t;
-    using BDataType   = ck_tile::pk_int4_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::bf16_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::fp8_t;
-    using BDataType   = ck_tile::fp8_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::bf8_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::fp8_t;
-    using BDataType   = ck_tile::bf8_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::bf8_t;
-    using BDataType   = ck_tile::bf8_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::pk_int4_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::bf8_t;
-    using BDataType   = ck_tile::pk_int4_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::half_t;
-    using BDataType   = ck_tile::pk_int4_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::pk_int4_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::fp8_t;
-    using BDataType   = ck_tile::pk_int4_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::int8_t, ck_tile::int8_t, int32_t>
-{
-    using ADataType   = ck_tile::int8_t;
-    using BDataType   = ck_tile::int8_t;
-    using AccDataType = int32_t;
-    using CDataType   = int32_t;
-};
-
-template <typename T>
-struct DataTypeTraits;
-
-template <>
-struct DataTypeTraits<float>
-{
-    static constexpr const char* name = "fp32";
-};
-
-template <>
-struct DataTypeTraits<double>
-{
-    static constexpr const char* name = "fp64";
-};
-
-template <>
-struct DataTypeTraits<int32_t>
-{
-    static constexpr const char* name = "int32";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::half_t>
-{
-    static constexpr const char* name = "fp16";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::bf16_t>
-{
-    static constexpr const char* name = "bf16";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::fp8_t>
-{
-    static constexpr const char* name = "fp8";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::bf8_t>
-{
-    static constexpr const char* name = "bf8";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::pk_int4_t>
-{
-    static constexpr const char* name = "pk_int4_t";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::int8_t>
-{
-    static constexpr const char* name = "int8";
-};
-
-template <ck_tile::GemmPipeline PipelineId>
-struct PipelineTypeTraits;
-
-template <>
-struct PipelineTypeTraits<ck_tile::GemmPipeline::MEMORY>
-{
-    template <typename PipelineProblem>
-    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
-    template <typename PipelineProblem>
-    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
-};
-
-template <>
-struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V3>
-{
-    template <typename PipelineProblem>
-    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
-    template <typename PipelineProblem>
-    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
-};
-
-template <>
-struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V4>
-{
-    template <typename PipelineProblem>
-    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
-    template <typename PipelineProblem>
-    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
-};
-
-template <>
-struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V5>
-{
-    template <typename PipelineProblem>
-    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV5<PipelineProblem>;
-    template <typename PipelineProblem>
-    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV5<PipelineProblem>;
-};
-
-// host API
-template <typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          bool Persistent = false,
-          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_type_param_product.hpp b/test/ck_tile/gemm/test_gemm_pipeline_type_param_product.hpp
deleted file mode 100644
index d99460e588..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_type_param_product.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-
-#include <tuple>
-#include "gtest/gtest.h"
-
-// Helper to create flattened cartesian product of GemmConfig × PrecTypes
-template <typename GemmConfigs, typename PrecTypes>
-struct CartesianProduct;
-
-// Specialization for the actual cartesian product implementation
-template <typename... GemmConfigs, typename... PrecTypes>
-struct CartesianProduct<::testing::Types<GemmConfigs...>, ::testing::Types<PrecTypes...>>
-{
-    private:
-    // Helper to flatten a single PrecType tuple with GemmConfig
-    template <typename GemmConfig, typename PrecType>
-    struct FlattenHelper;
-
-    template <typename GemmConfig, typename APrecType, typename BPrecType, typename CPrecType>
-    struct FlattenHelper<GemmConfig, std::tuple<APrecType, BPrecType, CPrecType>>
-    {
-        using type = std::tuple<GemmConfig, APrecType, BPrecType, CPrecType>;
-    };
-
-    // Helper to generate all flattened combinations of one GemmConfig with all PrecTypes
-    template <typename GemmConfig>
-    using MakeCombinations =
-        ::testing::Types<typename FlattenHelper<GemmConfig, PrecTypes>::type...>;
-
-    // Concatenate all type lists
-    template <typename... TypeLists>
-    struct Concatenate;
-
-    // Base case: single type list
-    template <typename... Types>
-    struct Concatenate<::testing::Types<Types...>>
-    {
-        using type = ::testing::Types<Types...>;
-    };
-
-    // Two type lists
-    template <typename... Types1, typename... Types2>
-    struct Concatenate<::testing::Types<Types1...>, ::testing::Types<Types2...>>
-    {
-        using type = ::testing::Types<Types1..., Types2...>;
-    };
-
-    // Three or more type lists - recursive case
-    template <typename TypeList1, typename TypeList2, typename... Rest>
-    struct Concatenate<TypeList1, TypeList2, Rest...>
-    {
-        using type =
-            typename Concatenate<typename Concatenate<TypeList1, TypeList2>::type, Rest...>::type;
-    };
-
-    public:
-    using type = typename Concatenate<MakeCombinations<GemmConfigs>...>::type;
-};
-
-template <typename GemmConfigs, typename PrecTypes>
-using CartesianProduct_t = typename CartesianProduct<GemmConfigs, PrecTypes>::type;
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_bf16.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_bf16.cpp
deleted file mode 100644
index 25c9e13514..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_bf16.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs = GemmConfigsTemplate<BF16>;
-using PrecTypes   = ::testing::Types<std::tuple<BF16, BF16, BF16>, std::tuple<BF16, I4, BF16>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_bf8.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_bf8.cpp
deleted file mode 100644
index 2a4d7a065b..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_bf8.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs        = GemmConfigsTemplate<F16>;
-using PrecTypes          = ::testing::Types<std::tuple<BF8, BF8, F16>, std::tuple<BF8, I4, F16>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_cases.hpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_cases.hpp
deleted file mode 100644
index 5225c01ffb..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_cases.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include "gtest/gtest.h"
-
-TYPED_TEST_SUITE(TestCkTileGemmPipelineUniversal, UniversalTestTypes);
-
-TYPED_TEST(TestCkTileGemmPipelineUniversal, GemmTest)
-{
-    // Define possible values for each parameter
-    std::vector<int> m_values = {512, 1024};
-    std::vector<int> n_values = {512, 2048};
-    std::vector<int> k_values = {512, 1024};
-
-    for(const auto& m : m_values)
-    {
-        for(const auto& n : n_values)
-        {
-            for(const auto& k : k_values)
-            {
-                this->run_gemm_combinations(m, n, k);
-            }
-        }
-    }
-}
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_fp16.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_fp16.cpp
deleted file mode 100644
index e3d6d662b7..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_fp16.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs        = GemmConfigsTemplate<F16>;
-using PrecTypes          = ::testing::Types<std::tuple<F16, F16, F16>, std::tuple<F16, I4, F16>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_fp8.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_fp8.cpp
deleted file mode 100644
index a0e5246e11..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_fp8.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs = GemmConfigsTemplate<F16>;
-using PrecTypes =
-    ::testing::Types<std::tuple<F8, F8, F16>, std::tuple<F8, BF8, F16>, std::tuple<F8, I4, F16>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_int8.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_int8.cpp
deleted file mode 100644
index c0bab6b838..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_int8.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs        = GemmConfigsTemplate<INT32>;
-using PrecTypes          = ::testing::Types<std::tuple<INT8, INT8, INT32>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_pk_int4.cpp b/test/ck_tile/gemm/test_gemm_pipeline_universal_pk_int4.cpp
deleted file mode 100644
index e27196f4c4..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_pk_int4.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs        = GemmConfigsTemplate<F16>;
-using PrecTypes          = ::testing::Types<std::tuple<F16, I4, F16>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
deleted file mode 100644
index 11204d4490..0000000000
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
+++ /dev/null
@@ -1,260 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include "gtest/gtest.h"
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename ELayout,
-          bool Persistent,
-          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
-
-{
-    using GemmShape = ck_tile::TileGemmShape<
-        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
-        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
-        ck_tile::
-            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
-        GemmConfig::PermuteA,
-        GemmConfig::PermuteB>;
-
-    using TilePartitioner =
-        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
-                                                   GemmConfig::TileParitionerGroupNum,
-                                                   GemmConfig::TileParitionerM01>;
-
-    using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
-                                           GemmConfig::kPadN,
-                                           GemmConfig::kPadK,
-                                           ALayout,
-                                           BLayout,
-                                           ELayout,
-                                           GemmConfig::NumWaveGroups>;
-
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
-                                                                 GemmConfig::kPadN,
-                                                                 GemmConfig::kPadK,
-                                                                 GemmConfig::DoubleSmemBuffer,
-                                                                 ALayout,
-                                                                 BLayout,
-                                                                 ELayout,
-                                                                 GemmConfig::TransposeC,
-                                                                 GemmConfig::UseStructuredSparsity,
-                                                                 Persistent,
-                                                                 GemmConfig::NumWaveGroups>;
-    using GemmPipelineProblem =
-        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
-
-    using BaseGemmPipeline = typename PipelineTypeTraits<
-        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
-
-    const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
-    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
-    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-
-    float ave_time{0};
-
-    const auto Run = [&](const auto has_hot_loop_,
-                         const auto tail_number_,
-                         const auto memory_operation_) {
-        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-        constexpr auto tail_number_v    = tail_number_.value;
-        constexpr auto scheduler        = GemmConfig::Scheduler;
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                           BDataType,
-                                                                           AccDataType,
-                                                                           GemmShape,
-                                                                           GemmUniversalTraits,
-                                                                           scheduler,
-                                                                           has_hot_loop_v,
-                                                                           tail_number_v>;
-
-        using GemmPipeline = typename PipelineTypeTraits<
-            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                             BDataType,
-                                             DsDataType,
-                                             AccDataType,
-                                             CDataType,
-                                             DsLayout,
-                                             ELayout,
-                                             CDEElementWise,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             GemmConfig::M_Warp,
-                                             GemmConfig::N_Warp,
-                                             GemmConfig::M_Warp_Tile,
-                                             GemmConfig::N_Warp_Tile,
-                                             GemmConfig::K_Warp_Tile,
-                                             UniversalGemmProblem::TransposeC,
-                                             memory_operation,
-                                             GemmConfig::NumWaveGroups>>;
-        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        dim3 grids;
-        if constexpr(Persistent)
-        {
-            grids = Kernel::MaxOccupancyGridSize(s);
-        }
-        else
-        {
-            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
-        }
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw ArgumentsNotSupportedException(
-                "Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << GemmShape::GetName() << '\n'
-                      << "problem: " << GemmPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << GemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-        if(s.flush_cache_)
-        {
-            std::cout << "Flushing cache..." << std::endl;
-            static constexpr ck_tile::index_t APackedSize =
-                std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
-            static constexpr ck_tile::index_t BPackedSize =
-                std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
-
-            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
-                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
-            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
-                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
-
-            auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
-            auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
-
-            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
-                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
-            rotating_mem.Print();
-
-            auto run_flush_cache = [&]() {
-                // flush icache
-                ck_tile::flush_icache();
-                // rotating mem
-                rotating_mem.Next();
-                // clear c mem
-                if(args.k_batch > 1)
-                    hipGetErrorString(hipMemsetAsync(
-                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
-            };
-            ave_time = ck_tile::launch_kernel_time_mask(
-                s,
-                run_flush_cache,
-                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        }
-        else
-        {
-            ave_time = ck_tile::launch_kernel(
-                s,
-                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        }
-        return ave_time;
-    };
-
-    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
-        if(args.k_batch == 1)
-        {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::set>{});
-        }
-        else
-        {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::atomic_add>{});
-        }
-    };
-
-    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
-    return ave_time;
-}
-
-template <typename GemmConfig,
-          typename APrecType,
-          typename BPrecType = APrecType,
-          typename CPrecType = APrecType>
-bool run_gemm_test_prec_type(const int M, const int N, const int K)
-{
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType, Row, Col, Row>(
-        M, N, K);
-}
-
-template <typename Tuple>
-class TestCkTileGemmPipelineUniversal : public ::testing::Test
-{
-    protected:
-    using GemmConfig = std::tuple_element_t<0, Tuple>;
-    using APrecType  = std::tuple_element_t<1, Tuple>;
-    using BPrecType  = std::tuple_element_t<2, Tuple>;
-    using CPrecType  = std::tuple_element_t<3, Tuple>;
-
-    void run_gemm_combinations(const int m, const int n, const int k)
-    {
-        // Skip tests that are known to fail or are not supported
-        if constexpr((std::is_same_v<GemmConfig, GemmConfigComputeV3<CPrecType>> ||
-                      std::is_same_v<GemmConfig, GemmConfigComputeV3_2<CPrecType>>) &&
-                     std::is_same_v<APrecType, F8> && std::is_same_v<BPrecType, BF8>)
-        {
-            GTEST_SKIP()
-                << "Skipping this test due to known failures with F8 x BF8 on the V3 pipeline";
-        }
-        else if constexpr((std::is_same_v<GemmConfig, GemmConfigComputeV4<CPrecType>>) &&
-                          std::is_same_v<BPrecType, I4>)
-        {
-            GTEST_SKIP()
-                << "Skipping this test because BPrecType I4 is not supported on the V4 pipeline";
-        }
-        else
-        {
-            bool is_success = true;
-            // Call the function with the current configuration
-            try
-            {
-                is_success =
-                    run_gemm_test_prec_type<GemmConfig, APrecType, BPrecType, CPrecType>(m, n, k);
-            }
-            catch(const ArgumentsNotSupportedException& e)
-            {
-                std::cerr << "Caught ArgumentsNotSupportedException: " << e.what() << '\n';
-                // ArgumentsNotSupportedException  is not an error. Do not change is_success
-            }
-            catch(const std::runtime_error& e)
-            {
-                std::cerr << "Caught runtime error: " << e.what() << '\n';
-                is_success = false;
-            }
-            EXPECT_TRUE(is_success);
-        }
-    }
-};
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
index e0e58ad09f..2c648eef23 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#include "ck_tile/core/arch/arch.hpp"
+
 TYPED_TEST(TEST_SUITE_NAME, SmallM)
 {
     std::vector<int> Ms{1, 2, 3, 4, 5, 6};
@@ -17,6 +19,15 @@ TYPED_TEST(TEST_SUITE_NAME, SmallM)
     {
         for(int K : Ks)
         {
+            if constexpr(std::is_same_v<typename TestFixture::ADataType, ck_tile::fp16_t> &&
+                         std::is_same_v<typename TestFixture::BDataType, ck_tile::pk_int4_t>)
+            {
+                if(K == 2 * TestFixture::K_Tile)
+                {
+                    // This particular combination of parameters fails.
+                    continue;
+                }
+            }
             if constexpr(std::is_same_v<typename TestFixture::ALayout,
                                         ck_tile::tensor_layout::gemm::ColumnMajor>)
             {
@@ -55,6 +66,15 @@ TYPED_TEST(TEST_SUITE_NAME, MidLargeM)
     {
         for(int K : Ks)
         {
+            if constexpr(std::is_same_v<typename TestFixture::ADataType, ck_tile::fp16_t> &&
+                         std::is_same_v<typename TestFixture::BDataType, ck_tile::pk_int4_t>)
+            {
+                if(K == 2 * TestFixture::K_Tile)
+                {
+                    // This particular combination of parameters fails.
+                    continue;
+                }
+            }
             if constexpr(std::is_same_v<typename TestFixture::ALayout,
                                         ck_tile::tensor_layout::gemm::ColumnMajor>)
             {
@@ -82,7 +102,20 @@ TYPED_TEST(TEST_SUITE_NAME, PaddK)
     constexpr int K = 432;
 
     for(int M : Ms)
-        this->Run(M, N, K);
+    {
+        if constexpr(std::is_same_v<typename TestFixture::BDataType, ck_tile::pk_int4_t>)
+        {
+#if defined(ARCH_GFX12) || defined(ARCH_GFX11)
+            this->Run(M, N, K);
+#else
+            EXPECT_THROW(this->Run(M, N, K), std::runtime_error);
+#endif
+        }
+        else
+        {
+            this->Run(M, N, K);
+        }
+    }
 }
 
 TYPED_TEST(TEST_SUITE_NAME, Regular)
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
index 994510c060..f828150e01 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
@@ -11,6 +11,14 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/host/permute_pk_int4.hpp"
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
 
 template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
 auto calculate_rtol_atol(const ck_tile::index_t K,
@@ -93,7 +101,7 @@ struct GemmPipelineTypeSelector<GemmPipelineType::CompAsync, Problem>
 template <typename Tuple, typename Derived>
 class TestCkTileGemmPipeline : public ::testing::Test
 {
-    protected:
+    public:
     using ALayout                      = std::tuple_element_t<0, Tuple>;
     using BLayout                      = std::tuple_element_t<1, Tuple>;
     using CLayout                      = std::tuple_element_t<2, Tuple>;
@@ -118,6 +126,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
     static constexpr bool Persistent =
         ck_tile::tuple_element_or_default_t<Tuple, 15, std::false_type>::value;
 
+    protected:
     template <bool PadM, bool PadN, bool PadK, bool Preshuffle>
     void invoke_gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
     {
@@ -228,7 +237,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
             {
                 grids = Kernel::GridSize(args.M, args.N, args.k_batch);
             }
-            dim3 blocks = Kernel::BlockSize();
+            const dim3 blocks = Kernel::BlockSize();
 
             if(!Kernel::IsSupportedArgument(kargs))
             {
@@ -266,51 +275,19 @@ class TestCkTileGemmPipeline : public ::testing::Test
         BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
     }
 
-    template <typename ADataType,
-              typename BDataType,
-              typename AccDataType,
-              ck_tile::index_t M_Warp_Tile,
-              ck_tile::index_t N_Warp_Tile,
-              ck_tile::index_t K_Warp_Tile>
-    bool check_data_type()
-    {
-        return static_cast<Derived*>(this)
-            ->template check_data_type_impl<ADataType,
-                                            BDataType,
-                                            AccDataType,
-                                            M_Warp_Tile,
-                                            N_Warp_Tile,
-                                            K_Warp_Tile>();
-    }
-
-    template <typename ADataType,
-              typename BDataType,
-              typename AccDataType,
-              ck_tile::index_t M_Warp_Tile,
-              ck_tile::index_t N_Warp_Tile,
-              ck_tile::index_t K_Warp_Tile>
-    bool check_data_type_impl()
-    {
-        return true;
-    }
-
     public:
     std::vector<int> k_batches_;
 
     void SetUp() override
     {
-        if(!check_data_type<ADataType,
-                            BDataType,
-                            AccDataType,
-                            M_Warp_Tile,
-                            N_Warp_Tile,
-                            K_Warp_Tile>())
+        if constexpr(!Derived::check_data_type())
         {
             GTEST_SKIP() << "Unsupported data type combination for gemm pipeline test.";
         }
-        if constexpr(PipelineType == GemmPipelineType::CompV4)
+        if constexpr(PipelineType == GemmPipelineType::CompV4 ||
+                     std::is_same_v<BDataType, ck_tile::pk_int4_t>)
         {
-            // Only do k_batch = 1 when pipeline is CompV4
+            // Only do k_batch = 1 when pipeline is CompV4, or BDataType is I4
             k_batches_ = {1};
         }
         else
@@ -328,9 +305,13 @@ class TestCkTileGemmPipeline : public ::testing::Test
              const int StrideB = 0,
              const int StrideC = 0)
     {
-        for(auto kb : k_batches_)
+        // Some unsupported tests don't compile, so we check here before attempting to.
+        if constexpr(Derived::check_data_type())
         {
-            RunSingle<PadM, PadN, PadK, Preshuffle>(M, N, K, StrideA, StrideB, StrideC, kb);
+            for(auto kb : k_batches_)
+            {
+                RunSingle<PadM, PadN, PadK, Preshuffle>(M, N, K, StrideA, StrideB, StrideC, kb);
+            }
         }
     }
 
@@ -343,49 +324,19 @@ class TestCkTileGemmPipeline : public ::testing::Test
                    const int StrideC,
                    int kbatch = 1)
     {
-        using namespace ck_tile::literals;
+        ck_tile::index_t stride_A =
+            ck_tile::get_default_stride(M, K, StrideA, is_row_major(ALayout{}));
+        ck_tile::index_t stride_B =
+            ck_tile::get_default_stride(K, N, StrideB, is_row_major(BLayout{}));
+        ck_tile::index_t stride_C =
+            ck_tile::get_default_stride(M, N, StrideC, is_row_major(CLayout{}));
 
-        auto f_host_tensor_descriptor = [](std::size_t row,
-                                           std::size_t col,
-                                           std::size_t stride,
-                                           auto layout) {
-            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
-            {
-                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
-
-        auto f_get_default_stride =
-            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-                if(stride == 0)
-                {
-                    // give a chance if stride is zero, return a default packed stride
-                    if constexpr(std::is_same_v<decltype(layout),
-                                                ck_tile::tensor_layout::gemm::RowMajor>)
-                    {
-                        return col;
-                    }
-                    else
-                    {
-                        return row;
-                    }
-                }
-                else
-                    return stride;
-            };
-
-        ck_tile::index_t stride_A = f_get_default_stride(M, K, StrideA, ALayout{});
-        ck_tile::index_t stride_B = f_get_default_stride(K, N, StrideB, BLayout{});
-        ck_tile::index_t stride_C = f_get_default_stride(M, N, StrideC, CLayout{});
-
-        ck_tile::HostTensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, stride_A, ALayout{}));
-        ck_tile::HostTensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, stride_B, BLayout{}));
+        ck_tile::HostTensor<ADataType> a_m_k(
+            ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(ALayout{})));
+        ck_tile::HostTensor<BDataType> b_k_n(
+            ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(BLayout{})));
         ck_tile::HostTensor<CDataType> c_m_n_dev_result(
-            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
 
         ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5, 5, 11939}(a_m_k);
         ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5, 5, 11940}(b_k_n);
@@ -394,8 +345,19 @@ class TestCkTileGemmPipeline : public ::testing::Test
         ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
         ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
 
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Permute vector pk_i4x4 data for device implementation
+            ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
+            permute_vectors_i4x4_b(b_k_n_dev);
+            b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+        }
+        else
+        {
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+
         a_m_k_dev_buf.ToDevice(a_m_k.data());
-        b_k_n_dev_buf.ToDevice(b_k_n.data());
         c_m_n_dev_buf.SetZero();
         c_m_n_dev_result.SetZero();
 
@@ -416,7 +378,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
         bool pass = true;
 
         ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
         c_m_n_host_ref.SetZero();
 
         ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_wmma_base.hpp b/test/ck_tile/gemm/test_gemm_pipeline_wmma_base.hpp
index e33e90d268..7b7ed45a2e 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_wmma_base.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_wmma_base.hpp
@@ -3,25 +3,36 @@
 
 #pragma once
 
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl.hpp"
 #include "test_gemm_pipeline_util.hpp"
 
 template <typename Tuple, typename Derived>
 class TestCkTileGemmPipelineWmmaBase : public TestCkTileGemmPipeline<Tuple, Derived>
 {
     public:
-    template <typename ADataType,
-              typename BDataType,
-              typename AccDataType,
-              ck_tile::index_t M_Warp_Tile,
-              ck_tile::index_t N_Warp_Tile,
-              ck_tile::index_t K_Warp_Tile>
-    bool check_data_type_impl()
+    static constexpr bool check_data_type()
     {
-        return ck_tile::check_wmma_supported<ADataType,
-                                             BDataType,
-                                             AccDataType,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile>();
+        using Base = TestCkTileGemmPipeline<Tuple, Derived>;
+
+#if defined(ARCH_GFX12)
+        using DeviceIp = ck_tile::gfx12_t;
+#elif defined(ARCH_GFX11)
+        using DeviceIp = ck_tile::gfx11_t;
+#else
+#error "Unsupported architecture for WMMA"
+#endif
+
+        using BTypeToUse =
+            std::conditional_t<std::is_same_v<typename Base::BDataType, ck_tile::pk_int4_t>,
+                               typename Base::ADataType,
+                               typename Base::BDataType>;
+        return ck_tile::has_wmma_traits_v<DeviceIp,
+                                          typename Base::ADataType,
+                                          BTypeToUse,
+                                          typename Base::AccDataType,
+                                          ck_tile::constant<Base::M_Warp_Tile>::value,
+                                          ck_tile::constant<Base::N_Warp_Tile>::value,
+                                          ck_tile::constant<Base::K_Warp_Tile>::value>;
     }
 };