Remove "basic" and universal GEMM tests, and incorporate their test cases into the GEMM pipeline tests (#3094)

* Add missing copyright statements * Use ck_tile::host_tensor_descriptor instead of a custom lambda * Refactor use of check_data_type in test classes * Use TEST_SUITE_NAME with TYPED_TEST_SUITE * Remove an unused namespace * Make dim3 const * Add BF8 x BF8 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp * Add F8 x BF8 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp * Add BF16 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp * Add BF16 x BF16 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp * Add BF8 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp * Add F8 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp * Add F16 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp * Skip failing tests of F16 x I4 for CompV3 with K == 2 * K_Tile * Add missing precision type combinations to CompV4 from CompV3 * Move the INT8 tests around for consistency with KernelTypesCompV3Wmma * Add missing precision type combinations to CompV3Wmma from CompV3 * Remove the basic and universal tests and their dependencies * On __gfx950__, avoid using transposed loading of A with datatype pk_int4_t of B * Use ADataType and BDataType instead of ComputeDataType for WarpGemm * Explicitly set some return types to void * Use more general typenames in InterleavedPKTypeLoader * Add load_interleaved_pk_type.hpp to common.hpp * Use std::is_same_v in load_int4_tile * Add handling of LoadTranspose to load_int4_tile * Factor out common code in several places using load_int4_tile * Add support for pk_int4_t using load_int4_tile * Fix formatting
2026-04-19 22:39:03 +00:00 · 2025-11-13 21:01:27 +02:00
parent 7d57bc169f
commit f2cfc6b94e
38 changed files with 352 additions and 1888 deletions
--- a/test/ck_tile/gemm/CMakeLists.txt
+++ b/test/ck_tile/gemm/CMakeLists.txt
@@ -13,49 +13,6 @@ list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS
 )
 set(EXAMPLE_GEMM_COMPILE_COMPUTE_ASYNC_OPTIONS ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})

-if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx11|gfx12")
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_int8 test_gemm_pipeline_universal_int8.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_int8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_pk_int4 test_gemm_pipeline_universal_pk_int4.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_pk_int4 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-else()
-    message(DEBUG "Skipping ck_tile_gemm tests for current target")
-endif()
-
-
-if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
-    add_gtest_executable(test_gemm_pipeline_compiler test_gemm_pipeline_compiler.cpp)
-    target_compile_options(test_gemm_pipeline_compiler PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-endif()
-
-if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_fp8 test_gemm_pipeline_universal_fp8.cpp)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_bf8 test_gemm_pipeline_universal_bf8.cpp)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_basic_fp8 test_gemm_pipeline_basic_fp8.cpp)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_basic_bf8 test_gemm_pipeline_basic_bf8.cpp)
-
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_fp8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_bf8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_basic_fp8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_basic_bf8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-else()
-    message(DEBUG "Skipping ck_tile_gemm tests for current target")
-endif()
-
-if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_fp16 test_gemm_pipeline_universal_fp16.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_fp16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_fp16 PRIVATE --save-temps -Wno-gnu-line-marker)
-    add_gtest_executable(test_ck_tile_gemm_pipeline_universal_bf16 test_gemm_pipeline_universal_bf16.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_universal_bf16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    add_gtest_executable(test_ck_tile_gemm_pipeline_basic_fp16 test_gemm_pipeline_basic_fp16.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_basic_fp16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-    add_gtest_executable(test_ck_tile_gemm_pipeline_basic_bf16 test_gemm_pipeline_basic_bf16.cpp)
-    target_compile_options(test_ck_tile_gemm_pipeline_basic_bf16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-else()
-    message(DEBUG "Skipping ck_tile_gemm tests for current target ")
-endif()
-
 if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
    if(GPU_TARGETS MATCHES "gfx94|gfx95")
        add_gtest_executable(test_ck_tile_gemm_pipeline_mem test_gemm_pipeline_mem.cpp)
@@ -77,7 +34,16 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
    endif()

    if(GPU_TARGETS MATCHES "gfx11|gfx12")
-    # On Radeon devices, build the WMMA version instead
+        # On Radeon devices, build the WMMA version instead
+        # Define architecture macros for compile-time detection
+        if(GPU_TARGETS MATCHES "gfx12")
+            list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DARCH_GFX12)
+            list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS -DARCH_GFX12)
+        elseif(GPU_TARGETS MATCHES "gfx11")
+            list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DARCH_GFX11)
+            list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS -DARCH_GFX11)
+        endif()
+
        add_gtest_executable(test_ck_tile_gemm_pipeline_mem_wmma test_gemm_pipeline_mem_wmma.cpp)
        add_gtest_executable(test_ck_tile_gemm_pipeline_compv3_wmma test_gemm_pipeline_compv3_wmma.cpp)
        add_gtest_executable(test_ck_tile_gemm_pipeline_compv4_wmma test_gemm_pipeline_compv4_wmma.cpp)
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_bf16.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_bf16.cpp
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_basic_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using PrecTypes      = ::testing::Types<std::tuple<BF16, BF16, BF16>, std::tuple<BF16, I4, BF16>>;
-using BasicTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_basic_cases.hpp"
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_bf8.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_bf8.cpp
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_basic_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using PrecTypes      = ::testing::Types<std::tuple<BF8, BF8, F16>, std::tuple<BF8, I4, F16>>;
-using BasicTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_basic_cases.hpp"
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_cases.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_cases.hpp
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include "gtest/gtest.h"
-
-TYPED_TEST_SUITE(TestCkTileGemmPipelineBasic, BasicTestTypes);
-
-TYPED_TEST(TestCkTileGemmPipelineBasic, GemmTest)
-{
-    // Define possible values for each parameter
-    std::vector<int> m_values = {128, 1024};
-    std::vector<int> n_values = {128, 2048};
-    std::vector<int> k_values = {64, 128};
-
-    for(const auto& m : m_values)
-    {
-        for(const auto& n : n_values)
-        {
-            for(const auto& k : k_values)
-            {
-                this->run_gemm_combinations(m, n, k);
-            }
-        }
-    }
-}
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_fp16.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_fp16.cpp
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_basic_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using PrecTypes      = ::testing::Types<std::tuple<F16, F16, F16>, std::tuple<F16, I4, F16>>;
-using BasicTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_basic_cases.hpp"
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_fp8.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_fp8.cpp
@@ -1,14 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_basic_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using PrecTypes =
-    ::testing::Types<std::tuple<F8, F8, F16>, std::tuple<F8, BF8, F16>, std::tuple<F8, I4, F16>>;
-using BasicTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_basic_cases.hpp"
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
@@ -1,218 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <hip/hip_runtime.h>
-
-#include <cstring>
-#include <iostream>
-#include <ostream>
-#include <string>
-#include <tuple>
-
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-
-struct GemmConfig_Mfma : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 64;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = 16;
-};
-
-struct GemmConfig_Wmma : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 64;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 16;
-    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = 16;
-};
-
-#if CK_TILE_USE_WMMA
-using GemmConfigs = ::testing::Types<GemmConfig_Wmma>;
-#else
-using GemmConfigs = ::testing::Types<GemmConfig_Mfma>;
-#endif
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          bool Persistent,
-          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
-
-{
-    if constexpr(Persistent)
-        std::cout << "WARNING: Ignoring persistent kernel option for basic gemm." << std::endl;
-    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    constexpr int kBlockPerCu = 1;
-
-    // This part comes from the Codegen
-    constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile;
-    constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile;
-    constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile;
-    constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile;
-    constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile;
-
-    using CodegenGemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
-
-    using CodegenGemmTraits =
-        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-
-    using CodegenPipelineProblem = ck_tile::
-        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
-
-    using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
-
-    const auto Run = [&](const auto memory_operation_) {
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                             BDataType,
-                                             ck_tile::tuple<>,
-                                             AccDataType,
-                                             CDataType,
-                                             ck_tile::tuple<>,
-                                             CLayout,
-                                             ck_tile::element_wise::PassThrough,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             CodegenPipelineProblem::TransposeC,
-                                             memory_operation>>;
-
-        // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
-        // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
-        using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw ArgumentsNotSupportedException(
-                "Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenGemmShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-
-    if(args.k_batch == 1)
-    {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::set>{});
-    }
-    else
-    {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::atomic_add>{});
-    }
-}
-
-template <typename GemmConfig,
-          typename APrecType,
-          typename BPrecType = APrecType,
-          typename CPrecType = APrecType>
-bool run_gemm_test_prec_type(const int M, const int N, const int K)
-{
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType, Row, Col, Row>(
-        M, N, K);
-}
-
-template <typename Tuple>
-class TestCkTileGemmPipelineBasic : public ::testing::Test
-{
-    protected:
-    using GemmConfig = std::tuple_element_t<0, Tuple>;
-    using APrecType  = std::tuple_element_t<1, Tuple>;
-    using BPrecType  = std::tuple_element_t<2, Tuple>;
-    using CPrecType  = std::tuple_element_t<3, Tuple>;
-
-    void run_gemm_combinations(const int m, const int n, const int k)
-    {
-        // Skip tests that are known to fail
-        if constexpr(std::is_same_v<APrecType, F8> && std::is_same_v<BPrecType, BF8>)
-        {
-            GTEST_SKIP() << "Skipping this test due to known failures with F8 x BF8";
-        }
-        else if constexpr(std::is_same_v<APrecType, F16> && std::is_same_v<BPrecType, I4>)
-        {
-            GTEST_SKIP() << "Skipping this test due to known failures with F16 x I4";
-        }
-        else
-        {
-            bool is_success = true;
-            std::cout << "-m=" << m << " -n=" << n << " -k=" << k << std::endl;
-
-            // Call the function with the current configuration
-            try
-            {
-                is_success =
-                    run_gemm_test_prec_type<GemmConfig, APrecType, BPrecType, CPrecType>(m, n, k);
-            }
-            catch(const ArgumentsNotSupportedException& e)
-            {
-                std::cerr << "Caught ArgumentsNotSupportedException: " << e.what() << '\n';
-                // ArgumentsNotSupportedException  is not an error. Do not change is_success
-            }
-            catch(const std::runtime_error& e)
-            {
-                std::cerr << "Caught runtime error: " << e.what() << '\n';
-                is_success = false;
-            }
-            EXPECT_TRUE(is_success);
-        }
-    }
-};
--- a/test/ck_tile/gemm/test_gemm_pipeline_comp_async.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_comp_async.cpp
@@ -7,13 +7,15 @@

 template <typename T>
 class TestCkTileGemmPipelineCompAsync
-    : public TestCkTileGemmPipeline<T, class TestCkTileGemmPipelineCompAsync<T>>
+    : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompAsync<T>>
 {
+    public:
+    static constexpr bool check_data_type() { return true; }
 };

 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompAsync

-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompAsync, KernelTypesCompAsync);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompAsync);

 #include "test_gemm_pipeline_ut_cases.inc"

--- a/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp
@@ -9,11 +9,28 @@ template <typename T>
 class TestCkTileGemmPipelineCompV3
    : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV3<T>>
 {
+    public:
+    static constexpr bool check_data_type()
+    {
+        using Base = TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV3<T>>;
+        if constexpr(std::is_same_v<typename Base::ADataType, F8> &&
+                     std::is_same_v<typename Base::BDataType, BF8>)
+        {
+            return false;
+        }
+        else if constexpr(std::is_same_v<typename Base::BLayout, Row> &&
+                          std::is_same_v<typename Base::BDataType, I4>)
+        {
+            return false;
+        }
+
+        return true;
+    }
 };

 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV3

-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV3, KernelTypesCompV3);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompV3);

 #include "test_gemm_pipeline_ut_cases.inc"

--- a/test/ck_tile/gemm/test_gemm_pipeline_compv3_wmma.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv3_wmma.cpp
@@ -9,11 +9,26 @@ template <typename T>
 class TestCkTileGemmPipelineCompV3Wmma
    : public TestCkTileGemmPipelineWmmaBase<T, TestCkTileGemmPipelineCompV3Wmma<T>>
 {
+    public:
+    static constexpr bool check_data_type()
+    {
+        using Base1 = TestCkTileGemmPipelineWmmaBase<T, TestCkTileGemmPipelineCompV3Wmma<T>>;
+        using Base2 = TestCkTileGemmPipeline<T, Base1>;
+        if constexpr(std::is_same_v<typename Base2::BLayout, Row> &&
+                     std::is_same_v<typename Base2::BDataType, I4>)
+        {
+            return false;
+        }
+        else
+        {
+            return Base1::check_data_type();
+        }
+    }
 };

 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV3Wmma

-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV3Wmma, KernelTypesCompV3Wmma);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompV3Wmma);

 #include "test_gemm_pipeline_ut_cases.inc"

--- a/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp
@@ -9,11 +9,21 @@ template <typename T>
 class TestCkTileGemmPipelineCompV4
    : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV4<T>>
 {
+    public:
+    static constexpr bool check_data_type()
+    {
+        using Base = TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV4<T>>;
+        if constexpr(std::is_same_v<typename Base::BDataType, I4>)
+        {
+            return false;
+        }
+        return true;
+    }
 };

 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV4

-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV4, KernelTypesCompV4);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompV4);

 #include "test_gemm_pipeline_ut_cases.inc"

--- a/test/ck_tile/gemm/test_gemm_pipeline_compv4_wmma.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv4_wmma.cpp
@@ -13,7 +13,7 @@ class TestCkTileGemmPipelineCompV4Wmma

 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV4Wmma

-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV4Wmma, KernelTypesCompV4Wmma);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompV4Wmma);

 #include "test_gemm_pipeline_ut_cases.inc"

--- a/test/ck_tile/gemm/test_gemm_pipeline_compv6.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv6.cpp
@@ -1,3 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include "test_gemm_pipeline_kernel_types.hpp"
 #include "test_gemm_pipeline_util.hpp"
 #include "gtest/gtest.h"
@@ -6,11 +9,13 @@ template <typename T>
 class TestCkTileGemmPipelineCompV6
    : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV6<T>>
 {
+    public:
+    static constexpr bool check_data_type() { return true; }
 };

 #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV6

-TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV6, KernelTypesCompV6);
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompV6);

 #include "test_gemm_pipeline_ut_cases.inc"

--- a/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
@@ -79,55 +79,131 @@ using KernelTypesMemWmma = ::testing::Types<

 using KernelTypesCompV3 = ::testing::Types<
    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
    std::tuple<    Col,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>
+    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>
 >;

 using KernelTypesCompV3Wmma = ::testing::Types<
    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F16,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF16,      I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF8,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F16,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF16,      I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Row,     Col,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF8,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F16,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Col,     Row,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF16,      I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Col,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,        BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,        I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Col,     Row,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF8,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F16,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Col,     Col,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF16,      I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>
+    std::tuple<    Col,     Col,     Row,       F8,        BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F8,        I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF8,       I4,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>
 >;

 using KernelTypesCompV4 = ::testing::Types<
    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
    std::tuple<    Row,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
    std::tuple<    Row,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
    std::tuple<    Col,     Row,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       F16,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       BF16,      BF16,        F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       BF16,      I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
-    std::tuple<    Col,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV4>
+    std::tuple<    Col,     Col,     Row,       F8,        BF8,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       F8,        I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       BF8,       I4,          F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>
 >;

 // clang-format on
--- a/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp
@@ -1,3 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include "test_gemm_pipeline_kernel_types.hpp"
 #include "test_gemm_pipeline_util.hpp"
 #include "gtest/gtest.h"
@@ -5,6 +8,8 @@
 template <typename T>
 class TestCkTileGemmPipelineMem : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineMem<T>>
 {
+    public:
+    static constexpr bool check_data_type() { return true; }
 };

 #define TEST_SUITE_NAME TestCkTileGemmPipelineMem
--- a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
@@ -9,6 +9,8 @@ template <typename T>
 class TestCkTileGemmPipelinePersistent
    : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelinePersistent<T>>
 {
+    public:
+    static constexpr bool check_data_type() { return true; }
 };

 #define TEST_SUITE_NAME TestCkTileGemmPipelinePersistent
--- a/test/ck_tile/gemm/test_gemm_pipeline_smoke_run_test.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_smoke_run_test.inc
@@ -1,392 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-
-#include "ck_tile/host/permute_pk_int4.hpp"
-
-template <typename Layout>
-static constexpr inline auto is_row_major(Layout layout_)
-{
-    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
-                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
-}
-
-template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
-auto calculate_rtol_atol(const ck_tile::index_t K,
-                         const ck_tile::index_t kbatch,
-                         const float max_accumulated_value)
-{
-    using ComputeType =
-        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
-    // Calculate thresholds
-    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
-        ck_tile::integer_divide_ceil(K, kbatch));
-    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
-        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
-    // Calculate error due to split_k accumulation
-    const auto rtol_split_k =
-        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
-    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
-        max_accumulated_value, kbatch);
-    // Use higher threshold
-    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
-}
-
-template <typename GemmConfig,
-          typename Tensor,
-          typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
-void permute_tensor_b(Tensor& tensor)
-{
-    using GemmShape = ck_tile::TileGemmShape<
-        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
-        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
-        ck_tile::
-            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
-        GemmConfig::PermuteA,
-        GemmConfig::PermuteB>;
-
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
-                                                                 GemmConfig::kPadN,
-                                                                 GemmConfig::kPadK,
-                                                                 GemmConfig::DoubleSmemBuffer,
-                                                                 ALayout,
-                                                                 BLayout,
-                                                                 CLayout,
-                                                                 GemmConfig::TransposeC,
-                                                                 GemmConfig::UseStructuredSparsity>;
-
-    using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                       BDataType,
-                                                                       AccDataType,
-                                                                       GemmShape,
-                                                                       GemmUniversalTraits,
-                                                                       GemmConfig::Scheduler,
-                                                                       true,
-                                                                       ck_tile::TailNumber::Full>;
-
-    using GemmPipeline = typename PipelineTypeTraits<GemmConfig::Pipeline>::template GemmPipeline<
-        UniversalGemmProblem>;
-
-    const ck_tile::index_t K  = tensor.get_length(0);
-    const ck_tile::index_t N  = tensor.get_length(1);
-    const ck_tile::index_t K1 = GemmPipeline::GetSmemPackB();
-    const ck_tile::index_t K0 = K / K1;
-
-    Tensor tensor_copy = tensor;
-
-    // int K0, N, K1
-    for(int j = 0; j < K0; j++)
-    {
-        for(int i = 0; i < N; i++)
-        {
-            for(int jj = 0; jj < K1; jj++)
-            {
-                tensor(j * N * K1 + i * K1 + jj) = tensor_copy(i * K + (j * K1 + jj));
-            }
-        }
-    }
-}
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          bool Persistent,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
-                  ck_tile::DeviceMem& b_k_n_dev_buf,
-                  ck_tile::DeviceMem& c_m_n_dev_buf,
-                  ck_tile::index_t M,
-                  ck_tile::index_t N,
-                  ck_tile::index_t K,
-                  ck_tile::index_t stride_A,
-                  ck_tile::index_t stride_B,
-                  ck_tile::index_t stride_C,
-                  ck_tile::index_t kbatch,
-                  int n_warmup,
-                  int n_repeat,
-                  bool persistent)
-{
-    ck_tile::GemmHostArgs args = {a_m_k_dev_buf.GetDeviceBuffer(),
-                                  b_k_n_dev_buf.GetDeviceBuffer(),
-                                  c_m_n_dev_buf.GetDeviceBuffer(),
-                                  kbatch,
-                                  M,
-                                  N,
-                                  K,
-                                  stride_A,
-                                  stride_B,
-                                  stride_C};
-
-    float ave_time;
-    if(persistent)
-    {
-        ave_time = gemm<GemmConfig,
-                        ADataType,
-                        BDataType,
-                        DsDataType,
-                        AccDataType,
-                        CDataType,
-                        ALayout,
-                        BLayout,
-                        DsLayout,
-                        CLayout,
-                        true,
-                        CDEElementWise>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
-    }
-    else
-    {
-        ave_time = gemm<GemmConfig,
-                        ADataType,
-                        BDataType,
-                        DsDataType,
-                        AccDataType,
-                        CDataType,
-                        ALayout,
-                        BLayout,
-                        DsLayout,
-                        CLayout,
-                        false,
-                        CDEElementWise>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
-    }
-
-    std::size_t flop = std::size_t(2) * M * N * K;
-    std::size_t num_byte =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K
-              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
-              << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
-              << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits<ADataType>::name
-              << " B_Type=" << DataTypeTraits<BDataType>::name
-              << " C_Type=" << DataTypeTraits<CDataType>::name
-              << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
-              << " Persistent=" << (persistent ? "on" : "off") << " : " << ave_time << " ms, "
-              << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
-
-    return ave_time;
-}
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType = ADataType,
-          typename CDataType = ADataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
-bool run_gemm_test_with_layouts(const int M, const int N, const int K)
-{
-    using AccDataType = typename GemmTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
-
-    ck_tile::index_t stride_A = 0;
-    ck_tile::index_t stride_B = 0;
-    ck_tile::index_t stride_C = 0;
-
-    constexpr ck_tile::index_t kbatch = 1;
-    constexpr int init_method         = 0;
-    constexpr int verification_method = 2;
-    constexpr int n_warmup            = 0;
-    constexpr int n_repeat            = 1;
-    constexpr bool persistent         = false;
-
-    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(ALayout{}));
-    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(BLayout{}));
-    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
-
-    ck_tile::HostTensor<ADataType> a_m_k(
-        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(ALayout{})));
-    ck_tile::HostTensor<BDataType> b_k_n(
-        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(BLayout{})));
-    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
-        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-
-    if constexpr(init_method == 0)
-    {
-        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
-        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
-    }
-    else if constexpr(init_method == 1)
-    {
-        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
-        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
-    }
-    else if constexpr(init_method == 2)
-    {
-        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k);
-        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n);
-    }
-    else
-    {
-        a_m_k.SetZero();
-        b_k_n.SetZero();
-    }
-
-    if(GemmConfig::UseStructuredSparsity)
-    {
-        ck_tile::AdjustToStructuredSparsity<ADataType>{}(a_m_k);
-    }
-
-    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
-
-    static_assert(!GemmConfig::PermuteA, "Not implemented");
-    if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
-    {
-        // Permute vector pk_i4x4 data for device implementation
-        ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
-        if constexpr(GemmConfig::PermuteB)
-        {
-            permute_tensor_b<GemmConfig,
-                             decltype(b_k_n_dev),
-                             ADataType,
-                             BDataType,
-                             AccDataType,
-                             CDataType,
-                             ALayout,
-                             BLayout,
-                             CLayout>(b_k_n_dev);
-        }
-        permute_vectors_i4x4_b(b_k_n_dev);
-        b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
-    }
-    else
-    {
-        if constexpr(GemmConfig::PermuteB)
-        {
-            std::cout << "Permute for this DataType is not implemented." << std::endl;
-            return false;
-        }
-        b_k_n_dev_buf.ToDevice(b_k_n.data());
-    }
-
-    a_m_k_dev_buf.ToDevice(a_m_k.data());
-    c_m_n_dev_buf.SetZero();
-    c_m_n_dev_result.SetZero();
-
-    invoke_gemm<GemmConfig,
-                ADataType,
-                BDataType,
-                ck_tile::tuple<>,
-                AccDataType,
-                CDataType,
-                ALayout,
-                BLayout,
-                ck_tile::tuple<>,
-                CLayout>(a_m_k_dev_buf,
-                         b_k_n_dev_buf,
-                         c_m_n_dev_buf,
-                         M,
-                         N,
-                         K,
-                         stride_A,
-                         stride_B,
-                         stride_C,
-                         kbatch,
-                         n_warmup,
-                         n_repeat,
-                         persistent);
-
-    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
-    bool pass = true;
-
-    if constexpr(verification_method == 1)
-    {
-        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-        c_m_n_host_ref.SetZero();
-
-        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
-            a_m_k, b_k_n, c_m_n_host_ref);
-        const float max_accumulated_value =
-            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_host_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-
-        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                  << std::endl;
-        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
-    }
-    else if constexpr(verification_method == 2)
-    {
-        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
-        {
-            // Restore input for B for gpu reference
-            b_k_n_dev_buf.ToDevice(b_k_n.data());
-        }
-
-        // memory on host to store gpu reference result
-        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-        // memory on device to store gpu reference result
-        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
-
-        c_m_n_gpu_ref.SetZero();
-        c_m_n_gpu_buf_ref.SetZero();
-
-        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
-        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
-        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());
-
-        ck_tile::reference_gemm_gpu<ADataType,
-                                    BDataType,
-                                    AccDataType,
-                                    CDataType,
-                                    ALayout,
-                                    BLayout,
-                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
-
-        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
-
-        const float max_accumulated_value =
-            *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_gpu_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                  << std::endl;
-        std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
-    }
-
-    return pass;
-}
--- a/test/ck_tile/gemm/test_gemm_pipeline_smoke_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_smoke_util.hpp
@@ -1,450 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <string>
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/host/kernel_launch.hpp"
-#include "ck_tile/ops/epilogue.hpp"
-#include "ck_tile/ops/gemm.hpp"
-
-class ArgumentsNotSupportedException : public std::logic_error
-{
-    public:
-    explicit ArgumentsNotSupportedException(const std::string& message) : logic_error(message) {}
-};
-
-// temporary workaround to get k_warp_tile based on PrecType and gfx950 or not
-template <typename PrecType, ck_tile::index_t M_Warp_Tile>
-constexpr ck_tile::index_t get_k_warp_tile()
-{
-#if defined(CK_GFX950_SUPPORT)
-    constexpr bool is_8bit_float =
-        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
-    if constexpr(M_Warp_Tile == 32)
-        return is_8bit_float ? 64 : 16;
-    else
-        return is_8bit_float ? 128 : 32;
-#else
-    if constexpr(M_Warp_Tile == 32)
-        return 16;
-    else
-        return 32;
-#endif
-}
-
-struct GemmConfigBase
-{
-    static constexpr bool kPadM = false;
-    static constexpr bool kPadN = false;
-    static constexpr bool kPadK = false;
-
-    static constexpr bool PermuteA = false;
-    static constexpr bool PermuteB = false;
-
-    static constexpr bool TransposeC            = false;
-    static constexpr bool UseStructuredSparsity = false;
-
-    static constexpr int kBlockPerCu                         = 1;
-    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
-    static constexpr ck_tile::index_t TileParitionerM01      = 4;
-    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
-    static constexpr ck_tile::index_t NumWaveGroups = 1;
-};
-
-template <typename PrecType>
-struct GemmConfigMemoryInterwave : public GemmConfigBase
-{
-    // Memory friendly for Interwave scheduler
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 32;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 4;
-    static constexpr ck_tile::index_t N_Warp = 1;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16;
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::MEMORY;
-    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Interwave;
-};
-
-template <typename PrecType>
-struct GemmConfigMemoryIntrawave : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 32;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 4;
-    static constexpr ck_tile::index_t N_Warp = 1;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16;
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::MEMORY;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV3 : public GemmConfigBase
-{
-    // Compute V3 only support Intrawave scheduler
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV3_1 : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV3_2 : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 16;
-    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
-
-    static constexpr int kBlockPerCu = 2;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV4 : public GemmConfigBase
-{
-    // Compute V4 only support Intrawave scheduler
-    // Using the ping pong reader in the lds level
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer          = true;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V4;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV4_1 : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer          = true;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V4;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV5 : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 1;
-    static constexpr ck_tile::index_t N_Warp = 1;
-    static constexpr ck_tile::index_t K_Warp = 2;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
-
-    static constexpr bool DoubleSmemBuffer               = false;
-    static constexpr ck_tile::GemmPipeline Pipeline      = ck_tile::GemmPipeline::COMPUTE_V5;
-    static constexpr ck_tile::index_t NumWaNumWaveGroups = 2;
-};
-
-template <typename PrecType>
-struct GemmConfigComputeV3_WMMA : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
-
-    static constexpr ck_tile::index_t M_Warp = 4;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 16;
-    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    static constexpr bool DoubleSmemBuffer          = false;
-    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
-
-    static constexpr int kBlockPerCu = 2;
-};
-
-template <typename PrecType>
-#if CK_TILE_USE_WMMA
-using GemmConfigsTemplate = ::testing::Types<GemmConfigComputeV3_WMMA<PrecType>>;
-#else
-using GemmConfigsTemplate = ::testing::Types<GemmConfigComputeV3<PrecType>,
-                                             GemmConfigComputeV3_2<PrecType>,
-                                             GemmConfigComputeV4<PrecType>>;
-#endif
-
-template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
-struct GemmTypeConfig;
-
-template <>
-struct GemmTypeConfig<ck_tile::half_t>
-{
-    using ADataType   = ck_tile::half_t;
-    using BDataType   = ck_tile::half_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-    // ToDo: Add more bias config to support different categories of GEMM.
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t>
-{
-    using ADataType   = ck_tile::bf16_t;
-    using BDataType   = ck_tile::bf16_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::bf16_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::bf16_t, ck_tile::pk_int4_t, ck_tile::bf16_t>
-{
-    using ADataType   = ck_tile::bf16_t;
-    using BDataType   = ck_tile::pk_int4_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::bf16_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::fp8_t;
-    using BDataType   = ck_tile::fp8_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::bf8_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::fp8_t;
-    using BDataType   = ck_tile::bf8_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::bf8_t;
-    using BDataType   = ck_tile::bf8_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::pk_int4_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::bf8_t;
-    using BDataType   = ck_tile::pk_int4_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::half_t;
-    using BDataType   = ck_tile::pk_int4_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::pk_int4_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::fp8_t;
-    using BDataType   = ck_tile::pk_int4_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::int8_t, ck_tile::int8_t, int32_t>
-{
-    using ADataType   = ck_tile::int8_t;
-    using BDataType   = ck_tile::int8_t;
-    using AccDataType = int32_t;
-    using CDataType   = int32_t;
-};
-
-template <typename T>
-struct DataTypeTraits;
-
-template <>
-struct DataTypeTraits<float>
-{
-    static constexpr const char* name = "fp32";
-};
-
-template <>
-struct DataTypeTraits<double>
-{
-    static constexpr const char* name = "fp64";
-};
-
-template <>
-struct DataTypeTraits<int32_t>
-{
-    static constexpr const char* name = "int32";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::half_t>
-{
-    static constexpr const char* name = "fp16";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::bf16_t>
-{
-    static constexpr const char* name = "bf16";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::fp8_t>
-{
-    static constexpr const char* name = "fp8";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::bf8_t>
-{
-    static constexpr const char* name = "bf8";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::pk_int4_t>
-{
-    static constexpr const char* name = "pk_int4_t";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::int8_t>
-{
-    static constexpr const char* name = "int8";
-};
-
-template <ck_tile::GemmPipeline PipelineId>
-struct PipelineTypeTraits;
-
-template <>
-struct PipelineTypeTraits<ck_tile::GemmPipeline::MEMORY>
-{
-    template <typename PipelineProblem>
-    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
-    template <typename PipelineProblem>
-    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
-};
-
-template <>
-struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V3>
-{
-    template <typename PipelineProblem>
-    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
-    template <typename PipelineProblem>
-    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
-};
-
-template <>
-struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V4>
-{
-    template <typename PipelineProblem>
-    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
-    template <typename PipelineProblem>
-    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
-};
-
-template <>
-struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V5>
-{
-    template <typename PipelineProblem>
-    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV5<PipelineProblem>;
-    template <typename PipelineProblem>
-    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV5<PipelineProblem>;
-};
-
-// host API
-template <typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          bool Persistent = false,
-          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
--- a/test/ck_tile/gemm/test_gemm_pipeline_type_param_product.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_type_param_product.hpp
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-
-#include <tuple>
-#include "gtest/gtest.h"
-
-// Helper to create flattened cartesian product of GemmConfig × PrecTypes
-template <typename GemmConfigs, typename PrecTypes>
-struct CartesianProduct;
-
-// Specialization for the actual cartesian product implementation
-template <typename... GemmConfigs, typename... PrecTypes>
-struct CartesianProduct<::testing::Types<GemmConfigs...>, ::testing::Types<PrecTypes...>>
-{
-    private:
-    // Helper to flatten a single PrecType tuple with GemmConfig
-    template <typename GemmConfig, typename PrecType>
-    struct FlattenHelper;
-
-    template <typename GemmConfig, typename APrecType, typename BPrecType, typename CPrecType>
-    struct FlattenHelper<GemmConfig, std::tuple<APrecType, BPrecType, CPrecType>>
-    {
-        using type = std::tuple<GemmConfig, APrecType, BPrecType, CPrecType>;
-    };
-
-    // Helper to generate all flattened combinations of one GemmConfig with all PrecTypes
-    template <typename GemmConfig>
-    using MakeCombinations =
-        ::testing::Types<typename FlattenHelper<GemmConfig, PrecTypes>::type...>;
-
-    // Concatenate all type lists
-    template <typename... TypeLists>
-    struct Concatenate;
-
-    // Base case: single type list
-    template <typename... Types>
-    struct Concatenate<::testing::Types<Types...>>
-    {
-        using type = ::testing::Types<Types...>;
-    };
-
-    // Two type lists
-    template <typename... Types1, typename... Types2>
-    struct Concatenate<::testing::Types<Types1...>, ::testing::Types<Types2...>>
-    {
-        using type = ::testing::Types<Types1..., Types2...>;
-    };
-
-    // Three or more type lists - recursive case
-    template <typename TypeList1, typename TypeList2, typename... Rest>
-    struct Concatenate<TypeList1, TypeList2, Rest...>
-    {
-        using type =
-            typename Concatenate<typename Concatenate<TypeList1, TypeList2>::type, Rest...>::type;
-    };
-
-    public:
-    using type = typename Concatenate<MakeCombinations<GemmConfigs>...>::type;
-};
-
-template <typename GemmConfigs, typename PrecTypes>
-using CartesianProduct_t = typename CartesianProduct<GemmConfigs, PrecTypes>::type;
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_bf16.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_bf16.cpp
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs = GemmConfigsTemplate<BF16>;
-using PrecTypes   = ::testing::Types<std::tuple<BF16, BF16, BF16>, std::tuple<BF16, I4, BF16>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_bf8.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_bf8.cpp
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs        = GemmConfigsTemplate<F16>;
-using PrecTypes          = ::testing::Types<std::tuple<BF8, BF8, F16>, std::tuple<BF8, I4, F16>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_cases.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_cases.hpp
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include "gtest/gtest.h"
-
-TYPED_TEST_SUITE(TestCkTileGemmPipelineUniversal, UniversalTestTypes);
-
-TYPED_TEST(TestCkTileGemmPipelineUniversal, GemmTest)
-{
-    // Define possible values for each parameter
-    std::vector<int> m_values = {512, 1024};
-    std::vector<int> n_values = {512, 2048};
-    std::vector<int> k_values = {512, 1024};
-
-    for(const auto& m : m_values)
-    {
-        for(const auto& n : n_values)
-        {
-            for(const auto& k : k_values)
-            {
-                this->run_gemm_combinations(m, n, k);
-            }
-        }
-    }
-}
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_fp16.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_fp16.cpp
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs        = GemmConfigsTemplate<F16>;
-using PrecTypes          = ::testing::Types<std::tuple<F16, F16, F16>, std::tuple<F16, I4, F16>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_fp8.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_fp8.cpp
@@ -1,17 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs = GemmConfigsTemplate<F16>;
-using PrecTypes =
-    ::testing::Types<std::tuple<F8, F8, F16>, std::tuple<F8, BF8, F16>, std::tuple<F8, I4, F16>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_int8.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_int8.cpp
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs        = GemmConfigsTemplate<INT32>;
-using PrecTypes          = ::testing::Types<std::tuple<INT8, INT8, INT32>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_pk_int4.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_pk_int4.cpp
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-#include "gtest/gtest.h"
-#include "ck_tile/host.hpp"
-#include "test_gemm_pipeline_smoke_util.hpp"
-#include "test_gemm_pipeline_smoke_run_test.inc"
-#include "test_gemm_pipeline_prec_types.hpp"
-#include "test_gemm_pipeline_universal_run_test.inc"
-#include "test_gemm_pipeline_type_param_product.hpp"
-
-// Test each combination of GEMM config and precision type tuple by forming a cartesian product
-using GemmConfigs        = GemmConfigsTemplate<F16>;
-using PrecTypes          = ::testing::Types<std::tuple<F16, I4, F16>>;
-using UniversalTestTypes = CartesianProduct_t<GemmConfigs, PrecTypes>;
-
-#include "test_gemm_pipeline_universal_cases.hpp"
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
@@ -1,260 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include "gtest/gtest.h"
-
-template <typename GemmConfig,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename ELayout,
-          bool Persistent,
-          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
-
-{
-    using GemmShape = ck_tile::TileGemmShape<
-        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
-        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
-        ck_tile::
-            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
-        GemmConfig::PermuteA,
-        GemmConfig::PermuteB>;
-
-    using TilePartitioner =
-        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
-                                                   GemmConfig::TileParitionerGroupNum,
-                                                   GemmConfig::TileParitionerM01>;
-
-    using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
-                                           GemmConfig::kPadN,
-                                           GemmConfig::kPadK,
-                                           ALayout,
-                                           BLayout,
-                                           ELayout,
-                                           GemmConfig::NumWaveGroups>;
-
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
-                                                                 GemmConfig::kPadN,
-                                                                 GemmConfig::kPadK,
-                                                                 GemmConfig::DoubleSmemBuffer,
-                                                                 ALayout,
-                                                                 BLayout,
-                                                                 ELayout,
-                                                                 GemmConfig::TransposeC,
-                                                                 GemmConfig::UseStructuredSparsity,
-                                                                 Persistent,
-                                                                 GemmConfig::NumWaveGroups>;
-    using GemmPipelineProblem =
-        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
-
-    using BaseGemmPipeline = typename PipelineTypeTraits<
-        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
-
-    const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
-    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
-    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-
-    float ave_time{0};
-
-    const auto Run = [&](const auto has_hot_loop_,
-                         const auto tail_number_,
-                         const auto memory_operation_) {
-        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-        constexpr auto tail_number_v    = tail_number_.value;
-        constexpr auto scheduler        = GemmConfig::Scheduler;
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                           BDataType,
-                                                                           AccDataType,
-                                                                           GemmShape,
-                                                                           GemmUniversalTraits,
-                                                                           scheduler,
-                                                                           has_hot_loop_v,
-                                                                           tail_number_v>;
-
-        using GemmPipeline = typename PipelineTypeTraits<
-            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                             BDataType,
-                                             DsDataType,
-                                             AccDataType,
-                                             CDataType,
-                                             DsLayout,
-                                             ELayout,
-                                             CDEElementWise,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             GemmConfig::M_Warp,
-                                             GemmConfig::N_Warp,
-                                             GemmConfig::M_Warp_Tile,
-                                             GemmConfig::N_Warp_Tile,
-                                             GemmConfig::K_Warp_Tile,
-                                             UniversalGemmProblem::TransposeC,
-                                             memory_operation,
-                                             GemmConfig::NumWaveGroups>>;
-        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        dim3 grids;
-        if constexpr(Persistent)
-        {
-            grids = Kernel::MaxOccupancyGridSize(s);
-        }
-        else
-        {
-            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
-        }
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw ArgumentsNotSupportedException(
-                "Wrong! Arguments not supported! Skipping gemm!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << GemmShape::GetName() << '\n'
-                      << "problem: " << GemmPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << GemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-        if(s.flush_cache_)
-        {
-            std::cout << "Flushing cache..." << std::endl;
-            static constexpr ck_tile::index_t APackedSize =
-                std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
-            static constexpr ck_tile::index_t BPackedSize =
-                std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
-
-            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
-                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
-            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
-                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
-
-            auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
-            auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
-
-            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
-                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
-            rotating_mem.Print();
-
-            auto run_flush_cache = [&]() {
-                // flush icache
-                ck_tile::flush_icache();
-                // rotating mem
-                rotating_mem.Next();
-                // clear c mem
-                if(args.k_batch > 1)
-                    hipGetErrorString(hipMemsetAsync(
-                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
-            };
-            ave_time = ck_tile::launch_kernel_time_mask(
-                s,
-                run_flush_cache,
-                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        }
-        else
-        {
-            ave_time = ck_tile::launch_kernel(
-                s,
-                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        }
-        return ave_time;
-    };
-
-    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
-        if(args.k_batch == 1)
-        {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::set>{});
-        }
-        else
-        {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::atomic_add>{});
-        }
-    };
-
-    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
-    return ave_time;
-}
-
-template <typename GemmConfig,
-          typename APrecType,
-          typename BPrecType = APrecType,
-          typename CPrecType = APrecType>
-bool run_gemm_test_prec_type(const int M, const int N, const int K)
-{
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType, Row, Col, Row>(
-        M, N, K);
-}
-
-template <typename Tuple>
-class TestCkTileGemmPipelineUniversal : public ::testing::Test
-{
-    protected:
-    using GemmConfig = std::tuple_element_t<0, Tuple>;
-    using APrecType  = std::tuple_element_t<1, Tuple>;
-    using BPrecType  = std::tuple_element_t<2, Tuple>;
-    using CPrecType  = std::tuple_element_t<3, Tuple>;
-
-    void run_gemm_combinations(const int m, const int n, const int k)
-    {
-        // Skip tests that are known to fail or are not supported
-        if constexpr((std::is_same_v<GemmConfig, GemmConfigComputeV3<CPrecType>> ||
-                      std::is_same_v<GemmConfig, GemmConfigComputeV3_2<CPrecType>>) &&
-                     std::is_same_v<APrecType, F8> && std::is_same_v<BPrecType, BF8>)
-        {
-            GTEST_SKIP()
-                << "Skipping this test due to known failures with F8 x BF8 on the V3 pipeline";
-        }
-        else if constexpr((std::is_same_v<GemmConfig, GemmConfigComputeV4<CPrecType>>) &&
-                          std::is_same_v<BPrecType, I4>)
-        {
-            GTEST_SKIP()
-                << "Skipping this test because BPrecType I4 is not supported on the V4 pipeline";
-        }
-        else
-        {
-            bool is_success = true;
-            // Call the function with the current configuration
-            try
-            {
-                is_success =
-                    run_gemm_test_prec_type<GemmConfig, APrecType, BPrecType, CPrecType>(m, n, k);
-            }
-            catch(const ArgumentsNotSupportedException& e)
-            {
-                std::cerr << "Caught ArgumentsNotSupportedException: " << e.what() << '\n';
-                // ArgumentsNotSupportedException  is not an error. Do not change is_success
-            }
-            catch(const std::runtime_error& e)
-            {
-                std::cerr << "Caught runtime error: " << e.what() << '\n';
-                is_success = false;
-            }
-            EXPECT_TRUE(is_success);
-        }
-    }
-};
--- a/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
@@ -3,6 +3,8 @@

 #pragma once

+#include "ck_tile/core/arch/arch.hpp"
+
 TYPED_TEST(TEST_SUITE_NAME, SmallM)
 {
    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
@@ -17,6 +19,15 @@ TYPED_TEST(TEST_SUITE_NAME, SmallM)
    {
        for(int K : Ks)
        {
+            if constexpr(std::is_same_v<typename TestFixture::ADataType, ck_tile::fp16_t> &&
+                         std::is_same_v<typename TestFixture::BDataType, ck_tile::pk_int4_t>)
+            {
+                if(K == 2 * TestFixture::K_Tile)
+                {
+                    // This particular combination of parameters fails.
+                    continue;
+                }
+            }
            if constexpr(std::is_same_v<typename TestFixture::ALayout,
                                        ck_tile::tensor_layout::gemm::ColumnMajor>)
            {
@@ -55,6 +66,15 @@ TYPED_TEST(TEST_SUITE_NAME, MidLargeM)
    {
        for(int K : Ks)
        {
+            if constexpr(std::is_same_v<typename TestFixture::ADataType, ck_tile::fp16_t> &&
+                         std::is_same_v<typename TestFixture::BDataType, ck_tile::pk_int4_t>)
+            {
+                if(K == 2 * TestFixture::K_Tile)
+                {
+                    // This particular combination of parameters fails.
+                    continue;
+                }
+            }
            if constexpr(std::is_same_v<typename TestFixture::ALayout,
                                        ck_tile::tensor_layout::gemm::ColumnMajor>)
            {
@@ -82,7 +102,20 @@ TYPED_TEST(TEST_SUITE_NAME, PaddK)
    constexpr int K = 432;

    for(int M : Ms)
-        this->Run(M, N, K);
+    {
+        if constexpr(std::is_same_v<typename TestFixture::BDataType, ck_tile::pk_int4_t>)
+        {
+#if defined(ARCH_GFX12) || defined(ARCH_GFX11)
+            this->Run(M, N, K);
+#else
+            EXPECT_THROW(this->Run(M, N, K), std::runtime_error);
+#endif
+        }
+        else
+        {
+            this->Run(M, N, K);
+        }
+    }
 }

 TYPED_TEST(TEST_SUITE_NAME, Regular)
--- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
@@ -11,6 +11,14 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/host/permute_pk_int4.hpp"
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}

 template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
 auto calculate_rtol_atol(const ck_tile::index_t K,
@@ -93,7 +101,7 @@ struct GemmPipelineTypeSelector<GemmPipelineType::CompAsync, Problem>
 template <typename Tuple, typename Derived>
 class TestCkTileGemmPipeline : public ::testing::Test
 {
-    protected:
+    public:
    using ALayout                      = std::tuple_element_t<0, Tuple>;
    using BLayout                      = std::tuple_element_t<1, Tuple>;
    using CLayout                      = std::tuple_element_t<2, Tuple>;
@@ -118,6 +126,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
    static constexpr bool Persistent =
        ck_tile::tuple_element_or_default_t<Tuple, 15, std::false_type>::value;

+    protected:
    template <bool PadM, bool PadN, bool PadK, bool Preshuffle>
    void invoke_gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
    {
@@ -228,7 +237,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
            {
                grids = Kernel::GridSize(args.M, args.N, args.k_batch);
            }
-            dim3 blocks = Kernel::BlockSize();
+            const dim3 blocks = Kernel::BlockSize();

            if(!Kernel::IsSupportedArgument(kargs))
            {
@@ -266,51 +275,19 @@ class TestCkTileGemmPipeline : public ::testing::Test
        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
    }

-    template <typename ADataType,
-              typename BDataType,
-              typename AccDataType,
-              ck_tile::index_t M_Warp_Tile,
-              ck_tile::index_t N_Warp_Tile,
-              ck_tile::index_t K_Warp_Tile>
-    bool check_data_type()
-    {
-        return static_cast<Derived*>(this)
-            ->template check_data_type_impl<ADataType,
-                                            BDataType,
-                                            AccDataType,
-                                            M_Warp_Tile,
-                                            N_Warp_Tile,
-                                            K_Warp_Tile>();
-    }
-
-    template <typename ADataType,
-              typename BDataType,
-              typename AccDataType,
-              ck_tile::index_t M_Warp_Tile,
-              ck_tile::index_t N_Warp_Tile,
-              ck_tile::index_t K_Warp_Tile>
-    bool check_data_type_impl()
-    {
-        return true;
-    }
-
    public:
    std::vector<int> k_batches_;

    void SetUp() override
    {
-        if(!check_data_type<ADataType,
-                            BDataType,
-                            AccDataType,
-                            M_Warp_Tile,
-                            N_Warp_Tile,
-                            K_Warp_Tile>())
+        if constexpr(!Derived::check_data_type())
        {
            GTEST_SKIP() << "Unsupported data type combination for gemm pipeline test.";
        }
-        if constexpr(PipelineType == GemmPipelineType::CompV4)
+        if constexpr(PipelineType == GemmPipelineType::CompV4 ||
+                     std::is_same_v<BDataType, ck_tile::pk_int4_t>)
        {
-            // Only do k_batch = 1 when pipeline is CompV4
+            // Only do k_batch = 1 when pipeline is CompV4, or BDataType is I4
            k_batches_ = {1};
        }
        else
@@ -328,9 +305,13 @@ class TestCkTileGemmPipeline : public ::testing::Test
             const int StrideB = 0,
             const int StrideC = 0)
    {
-        for(auto kb : k_batches_)
+        // Some unsupported tests don't compile, so we check here before attempting to.
+        if constexpr(Derived::check_data_type())
        {
-            RunSingle<PadM, PadN, PadK, Preshuffle>(M, N, K, StrideA, StrideB, StrideC, kb);
+            for(auto kb : k_batches_)
+            {
+                RunSingle<PadM, PadN, PadK, Preshuffle>(M, N, K, StrideA, StrideB, StrideC, kb);
+            }
        }
    }

@@ -343,49 +324,19 @@ class TestCkTileGemmPipeline : public ::testing::Test
                   const int StrideC,
                   int kbatch = 1)
    {
-        using namespace ck_tile::literals;
+        ck_tile::index_t stride_A =
+            ck_tile::get_default_stride(M, K, StrideA, is_row_major(ALayout{}));
+        ck_tile::index_t stride_B =
+            ck_tile::get_default_stride(K, N, StrideB, is_row_major(BLayout{}));
+        ck_tile::index_t stride_C =
+            ck_tile::get_default_stride(M, N, StrideC, is_row_major(CLayout{}));

-        auto f_host_tensor_descriptor = [](std::size_t row,
-                                           std::size_t col,
-                                           std::size_t stride,
-                                           auto layout) {
-            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
-            {
-                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
-
-        auto f_get_default_stride =
-            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-                if(stride == 0)
-                {
-                    // give a chance if stride is zero, return a default packed stride
-                    if constexpr(std::is_same_v<decltype(layout),
-                                                ck_tile::tensor_layout::gemm::RowMajor>)
-                    {
-                        return col;
-                    }
-                    else
-                    {
-                        return row;
-                    }
-                }
-                else
-                    return stride;
-            };
-
-        ck_tile::index_t stride_A = f_get_default_stride(M, K, StrideA, ALayout{});
-        ck_tile::index_t stride_B = f_get_default_stride(K, N, StrideB, BLayout{});
-        ck_tile::index_t stride_C = f_get_default_stride(M, N, StrideC, CLayout{});
-
-        ck_tile::HostTensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, stride_A, ALayout{}));
-        ck_tile::HostTensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, stride_B, BLayout{}));
+        ck_tile::HostTensor<ADataType> a_m_k(
+            ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(ALayout{})));
+        ck_tile::HostTensor<BDataType> b_k_n(
+            ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(BLayout{})));
        ck_tile::HostTensor<CDataType> c_m_n_dev_result(
-            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));

        ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5, 5, 11939}(a_m_k);
        ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5, 5, 11940}(b_k_n);
@@ -394,8 +345,19 @@ class TestCkTileGemmPipeline : public ::testing::Test
        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
        ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());

+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Permute vector pk_i4x4 data for device implementation
+            ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
+            permute_vectors_i4x4_b(b_k_n_dev);
+            b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+        }
+        else
+        {
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+
        a_m_k_dev_buf.ToDevice(a_m_k.data());
-        b_k_n_dev_buf.ToDevice(b_k_n.data());
        c_m_n_dev_buf.SetZero();
        c_m_n_dev_result.SetZero();

@@ -416,7 +378,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
        bool pass = true;

        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
        c_m_n_host_ref.SetZero();

        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
--- a/test/ck_tile/gemm/test_gemm_pipeline_wmma_base.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_wmma_base.hpp
@@ -3,25 +3,36 @@

 #pragma once

+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl.hpp"
 #include "test_gemm_pipeline_util.hpp"

 template <typename Tuple, typename Derived>
 class TestCkTileGemmPipelineWmmaBase : public TestCkTileGemmPipeline<Tuple, Derived>
 {
    public:
-    template <typename ADataType,
-              typename BDataType,
-              typename AccDataType,
-              ck_tile::index_t M_Warp_Tile,
-              ck_tile::index_t N_Warp_Tile,
-              ck_tile::index_t K_Warp_Tile>
-    bool check_data_type_impl()
+    static constexpr bool check_data_type()
    {
-        return ck_tile::check_wmma_supported<ADataType,
-                                             BDataType,
-                                             AccDataType,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile>();
+        using Base = TestCkTileGemmPipeline<Tuple, Derived>;
+
+#if defined(ARCH_GFX12)
+        using DeviceIp = ck_tile::gfx12_t;
+#elif defined(ARCH_GFX11)
+        using DeviceIp = ck_tile::gfx11_t;
+#else
+#error "Unsupported architecture for WMMA"
+#endif
+
+        using BTypeToUse =
+            std::conditional_t<std::is_same_v<typename Base::BDataType, ck_tile::pk_int4_t>,
+                               typename Base::ADataType,
+                               typename Base::BDataType>;
+        return ck_tile::has_wmma_traits_v<DeviceIp,
+                                          typename Base::ADataType,
+                                          BTypeToUse,
+                                          typename Base::AccDataType,
+                                          ck_tile::constant<Base::M_Warp_Tile>::value,
+                                          ck_tile::constant<Base::N_Warp_Tile>::value,
+                                          ck_tile::constant<Base::K_Warp_Tile>::value>;
    }
 };