From bea67cb5209938ae662ff9dd62b00010bb09cc78 Mon Sep 17 00:00:00 2001
From: JP-Fernando <103817231+JP-Fernando@users.noreply.github.com>
Date: Thu, 26 Feb 2026 00:22:05 +0100
Subject: [PATCH] [CK] Remove duplicated XDL/WMMA tests (#4415)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Motivation

When we started the RDNA4 support, the XDL instances did not support
WMMA instructions, so we duplicated some tests.

In this issue, we consolidated most of the duplicated test files into
common test files.

## Technical Details

The following tests were unified:

- `batched_gemm`

- `batched_gemm_gemm`

- `gemm_add`

- `gemm_universal`

- `grouped_convnd_bwd_data`

The following tests were exact duplicates, copied into two files with
`_xdl` and `_wmma` suffixes. Each is now unified into a single file
without a suffix:

- `gemm_multi_abd`

- `gemm_b_scale`

There is still an apparent duplication which is a special case, namely
`test_grouped_convnd_bwd_weight_interface_{suffix}` where `{suffix}` is
`xdl` or `wmma`.
However, the WMMA code relies on an old implementation, and is expected
to be removed in the future. In addition, it differs from the XDL
implementation significantly.
Therefore, it was decided to keep both files separate instead of
attempting any unification.

## Test Plan

`CMakeLists.txt` files were modified to support the new, unified tests.
In particular, testing was done for `gfx90a`, `gfx1201` and `gfx11`
architectures.

## Test Result

All tests passed successfully on all three tested architectures.

## Submission Checklist

- [x] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

---------

Co-authored-by: Fernando Jiménez <fernando.jimenez@streamhpc.com>
---
 test/batched_gemm/CMakeLists.txt              |  16 +-
 ...hed_gemm_xdl.cpp => test_batched_gemm.cpp} |  13 +
 test/batched_gemm/test_batched_gemm_wmma.cpp  | 268 ------------------
 test/batched_gemm_gemm/CMakeLists.txt         |  20 +-
 ...dl.cpp => test_batched_gemm_gemm_fp16.cpp} |  23 +-
 ...atched_gemm_gemm_fp16_wmma_cshuffle_v3.cpp | 128 ---------
 test/gemm_add/CMakeLists.txt                  |  55 ++--
 ...st_gemm_add_wmma.cpp => test_gemm_add.cpp} |   6 +-
 ...lu_wmma.cpp => test_gemm_add_fastgelu.cpp} |   6 +-
 test/gemm_add/test_gemm_add_fastgelu_xdl.cpp  |  33 ---
 ...dd_relu_xdl.cpp => test_gemm_add_relu.cpp} |   6 +-
 test/gemm_add/test_gemm_add_relu_wmma.cpp     |  33 ---
 ...dd_silu_xdl.cpp => test_gemm_add_silu.cpp} |   6 +-
 test/gemm_add/test_gemm_add_silu_wmma.cpp     |  33 ---
 test/gemm_add/test_gemm_add_xdl.cpp           |  32 ---
 test/gemm_b_scale/CMakeLists.txt              |  16 +-
 ...b_scale_wmma.cpp => test_gemm_b_scale.cpp} |   0
 test/gemm_b_scale/test_gemm_b_scale_xdl.cpp   |  45 ---
 test/gemm_multi_abd/CMakeLists.txt            |  18 +-
 ...i_abd_wmma.cpp => test_gemm_multi_abd.cpp} |   0
 .../test_gemm_multi_abd_xdl.cpp               | 154 ----------
 test/gemm_universal/CMakeLists.txt            |  41 ++-
 ..._bf16.cpp => test_gemm_universal_bf16.cpp} |  14 +-
 ..._fp16.cpp => test_gemm_universal_fp16.cpp} |   0
 ...dl_fp8.cpp => test_gemm_universal_fp8.cpp} |  31 +-
 .../test_gemm_universal_wmma_fp8.cpp          |  78 -----
 .../test_gemm_universal_xdl_bf16.cpp          |  99 -------
 .../test_gemm_universal_xdl_fp16.cpp          | 111 --------
 test/grouped_convnd_bwd_data/CMakeLists.txt   |  13 +-
 ...est_grouped_convnd_bwd_data_interface.cpp} | 163 ++++++++++-
 ...grouped_convnd_bwd_data_interface_wmma.cpp | 186 ------------
 31 files changed, 307 insertions(+), 1340 deletions(-)
 rename test/batched_gemm/{test_batched_gemm_xdl.cpp => test_batched_gemm.cpp} (96%)
 delete mode 100644 test/batched_gemm/test_batched_gemm_wmma.cpp
 rename test/batched_gemm_gemm/{test_batched_gemm_gemm_fp16_xdl.cpp => test_batched_gemm_gemm_fp16.cpp} (94%)
 delete mode 100644 test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_wmma_cshuffle_v3.cpp
 rename test/gemm_add/{test_gemm_add_wmma.cpp => test_gemm_add.cpp} (83%)
 rename test/gemm_add/{test_gemm_add_fastgelu_wmma.cpp => test_gemm_add_fastgelu.cpp} (80%)
 delete mode 100644 test/gemm_add/test_gemm_add_fastgelu_xdl.cpp
 rename test/gemm_add/{test_gemm_add_relu_xdl.cpp => test_gemm_add_relu.cpp} (80%)
 delete mode 100644 test/gemm_add/test_gemm_add_relu_wmma.cpp
 rename test/gemm_add/{test_gemm_add_silu_xdl.cpp => test_gemm_add_silu.cpp} (80%)
 delete mode 100644 test/gemm_add/test_gemm_add_silu_wmma.cpp
 delete mode 100644 test/gemm_add/test_gemm_add_xdl.cpp
 rename test/gemm_b_scale/{test_gemm_b_scale_wmma.cpp => test_gemm_b_scale.cpp} (100%)
 delete mode 100644 test/gemm_b_scale/test_gemm_b_scale_xdl.cpp
 rename test/gemm_multi_abd/{test_gemm_multi_abd_wmma.cpp => test_gemm_multi_abd.cpp} (100%)
 delete mode 100644 test/gemm_multi_abd/test_gemm_multi_abd_xdl.cpp
 rename test/gemm_universal/{test_gemm_universal_wmma_bf16.cpp => test_gemm_universal_bf16.cpp} (95%)
 rename test/gemm_universal/{test_gemm_universal_wmma_fp16.cpp => test_gemm_universal_fp16.cpp} (100%)
 rename test/gemm_universal/{test_gemm_universal_xdl_fp8.cpp => test_gemm_universal_fp8.cpp} (76%)
 delete mode 100644 test/gemm_universal/test_gemm_universal_wmma_fp8.cpp
 delete mode 100644 test/gemm_universal/test_gemm_universal_xdl_bf16.cpp
 delete mode 100644 test/gemm_universal/test_gemm_universal_xdl_fp16.cpp
 rename test/grouped_convnd_bwd_data/{test_grouped_convnd_bwd_data_interface_xdl.cpp => test_grouped_convnd_bwd_data_interface.cpp} (52%)
 delete mode 100644 test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp

diff --git a/test/batched_gemm/CMakeLists.txt b/test/batched_gemm/CMakeLists.txt
index 926fafcc97..02f42d4427 100644
--- a/test/batched_gemm/CMakeLists.txt
+++ b/test/batched_gemm/CMakeLists.txt
@@ -1,12 +1,12 @@
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
-add_gtest_executable(test_batched_gemm_xdl test_batched_gemm_xdl.cpp)
-if(result EQUAL 0)
-   target_link_libraries(test_batched_gemm_xdl PRIVATE utility device_batched_gemm_instance)
-endif()
-
-add_gtest_executable(test_batched_gemm_wmma test_batched_gemm_wmma.cpp)
-if(result EQUAL 0)
-   target_link_libraries(test_batched_gemm_wmma PRIVATE utility device_batched_gemm_instance)
+# NOTE: We test for XDL/WMMA support here instead of relying on the usual pattern matching in the parent CMakeLists. This is necessary
+# as these tests are universal and don't have "xdl" or "wmma" in their names to signify their target arch, but they will fail to link
+# the instance library if there are no instances present for the current arch.
+if (CK_USE_XDL OR CK_USE_WMMA) 
+   add_gtest_executable(test_batched_gemm test_batched_gemm.cpp)
+   if(result EQUAL 0)
+      target_link_libraries(test_batched_gemm PRIVATE utility device_batched_gemm_instance)
+   endif()
 endif()
diff --git a/test/batched_gemm/test_batched_gemm_xdl.cpp b/test/batched_gemm/test_batched_gemm.cpp
similarity index 96%
rename from test/batched_gemm/test_batched_gemm_xdl.cpp
rename to test/batched_gemm/test_batched_gemm.cpp
index 88170f9909..82068b4170 100644
--- a/test/batched_gemm/test_batched_gemm_xdl.cpp
+++ b/test/batched_gemm/test_batched_gemm.cpp
@@ -214,6 +214,13 @@ TEST_F(TestBatchedGemm, bf16)
     this->params.push_back({68, 68, 68, 2});
     this->params.push_back({40, 40, 40, 2});
     this->params.push_back({256, 256, 128, 3});
+
+    // Tests with larger MNK
+    this->params.push_back({512, 256, 128, 1});
+    this->params.push_back({256, 240, 192, 2});
+    this->params.push_back({256, 256, 128, 3});
+    this->params.push_back({240, 128, 128, 5});
+
     this->template Run<ck::bhalf_t>();
 }
 #endif
@@ -226,7 +233,13 @@ TEST_F(TestBatchedGemm, fp16)
     this->params.push_back({60, 60, 60, 2});
     this->params.push_back({68, 68, 68, 2});
     this->params.push_back({40, 40, 40, 2});
+
+    // Tests with larger MNK
+    this->params.push_back({512, 256, 128, 1});
+    this->params.push_back({256, 240, 192, 2});
     this->params.push_back({256, 256, 128, 3});
+    this->params.push_back({240, 128, 128, 5});
+
     this->template Run<ck::half_t>();
 }
 #endif
diff --git a/test/batched_gemm/test_batched_gemm_wmma.cpp b/test/batched_gemm/test_batched_gemm_wmma.cpp
deleted file mode 100644
index db751cf7d1..0000000000
--- a/test/batched_gemm/test_batched_gemm_wmma.cpp
+++ /dev/null
@@ -1,268 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include <cstdlib>
-#include <iostream>
-#include <initializer_list>
-#include <tuple>
-#include <vector>
-
-#include <gtest/gtest.h>
-
-#include "profiler/profile_batched_gemm_impl.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
-static ck::index_t param_mask     = 0xffff;
-static ck::index_t instance_index = -1;
-struct GemmParams
-{
-    ck::index_t M;
-    ck::index_t N;
-    ck::index_t K;
-    ck::index_t BatchCount;
-};
-
-class TestBatchedGemm : public ::testing::Test
-{
-    protected:
-    using Row = ck::tensor_layout::gemm::RowMajor;
-    using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-    std::vector<GemmParams> params;
-
-    template <typename DataType>
-    void Run()
-    {
-        using namespace ck::tensor_operation::device;
-
-        bool pass = true;
-        for(size_t i = 0; i < params.size(); i++)
-        {
-            if((param_mask & (1 << i)) == 0)
-            {
-                continue;
-            }
-            auto& param           = params[i];
-            const auto M          = param.M;
-            const auto N          = param.N;
-            const auto K          = param.K;
-            const auto BatchCount = param.BatchCount;
-
-            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                   DataType,
-                                                                   DataType,
-                                                                   Row,
-                                                                   Row,
-                                                                   Row,
-                                                                   PassThrough,
-                                                                   PassThrough,
-                                                                   PassThrough,
-                                                                   DeviceBatchedGemm<Row,
-                                                                                     Row,
-                                                                                     Row,
-                                                                                     DataType,
-                                                                                     DataType,
-                                                                                     DataType,
-                                                                                     PassThrough,
-                                                                                     PassThrough,
-                                                                                     PassThrough>>(
-                               true,
-                               1,
-                               false,
-                               1,
-                               M,
-                               N,
-                               K,
-                               K,
-                               N,
-                               N,
-                               M * K,
-                               K * N,
-                               M * N,
-                               BatchCount,
-                               instance_index);
-
-            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                   DataType,
-                                                                   DataType,
-                                                                   Row,
-                                                                   Col,
-                                                                   Row,
-                                                                   PassThrough,
-                                                                   PassThrough,
-                                                                   PassThrough,
-                                                                   DeviceBatchedGemm<Row,
-                                                                                     Col,
-                                                                                     Row,
-                                                                                     DataType,
-                                                                                     DataType,
-                                                                                     DataType,
-                                                                                     PassThrough,
-                                                                                     PassThrough,
-                                                                                     PassThrough>>(
-                               true,
-                               1,
-                               false,
-                               1,
-                               M,
-                               N,
-                               K,
-                               K,
-                               K,
-                               N,
-                               M * K,
-                               K * N,
-                               M * N,
-                               BatchCount,
-                               instance_index);
-
-            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                   DataType,
-                                                                   DataType,
-                                                                   Col,
-                                                                   Row,
-                                                                   Row,
-                                                                   PassThrough,
-                                                                   PassThrough,
-                                                                   PassThrough,
-                                                                   DeviceBatchedGemm<Col,
-                                                                                     Row,
-                                                                                     Row,
-                                                                                     DataType,
-                                                                                     DataType,
-                                                                                     DataType,
-                                                                                     PassThrough,
-                                                                                     PassThrough,
-                                                                                     PassThrough>>(
-                               true,
-                               1,
-                               false,
-                               1,
-                               M,
-                               N,
-                               K,
-                               M,
-                               N,
-                               N,
-                               M * K,
-                               K * N,
-                               M * N,
-                               BatchCount,
-                               instance_index);
-
-            pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
-                                                                   DataType,
-                                                                   DataType,
-                                                                   Col,
-                                                                   Col,
-                                                                   Row,
-                                                                   PassThrough,
-                                                                   PassThrough,
-                                                                   PassThrough,
-                                                                   DeviceBatchedGemm<Col,
-                                                                                     Col,
-                                                                                     Row,
-                                                                                     DataType,
-                                                                                     DataType,
-                                                                                     DataType,
-                                                                                     PassThrough,
-                                                                                     PassThrough,
-                                                                                     PassThrough>>(
-                               true,
-                               1,
-                               false,
-                               1,
-                               M,
-                               N,
-                               K,
-                               M,
-                               K,
-                               N,
-                               M * K,
-                               K * N,
-                               M * N,
-                               BatchCount,
-                               instance_index);
-        }
-        EXPECT_TRUE(pass);
-    }
-};
-
-// #ifdef CK_ENABLE_INT8
-// TEST_F(TestBatchedGemm, i8)
-// {
-//     this->params.push_back({64, 64, 64, 2});
-//     this->params.push_back({64, 64, 64, 1});
-//     this->params.push_back({60, 60, 60, 2});
-//     this->params.push_back({68, 68, 68, 2});
-//     this->params.push_back({40, 40, 40, 2});
-//     this->params.push_back({256, 256, 128, 3});
-//     this->template Run<int8_t>();
-// }
-// #endif
-
-#ifdef CK_ENABLE_BF16
-TEST_F(TestBatchedGemm, bf16)
-{
-    this->params.push_back({64, 64, 64, 2});
-    this->params.push_back({64, 64, 64, 1});
-    this->params.push_back({40, 40, 40, 2});
-    this->params.push_back({256, 256, 128, 3});
-
-    // Tests with larger MNK
-    this->params.push_back({512, 256, 128, 1});
-    this->params.push_back({256, 240, 192, 2});
-    this->params.push_back({256, 256, 128, 3});
-    this->params.push_back({240, 128, 128, 5});
-    this->template Run<ck::bhalf_t>();
-}
-#endif
-
-#ifdef CK_ENABLE_FP16
-TEST_F(TestBatchedGemm, fp16)
-{
-    this->params.push_back({64, 64, 64, 2});
-    this->params.push_back({64, 64, 64, 1});
-    this->params.push_back({40, 40, 40, 2});
-    this->params.push_back({256, 256, 128, 3});
-
-    // Tests with larger MNK
-    this->params.push_back({512, 256, 128, 1});
-    this->params.push_back({256, 240, 192, 2});
-    this->params.push_back({256, 256, 128, 3});
-    this->params.push_back({240, 128, 128, 5});
-    this->template Run<ck::half_t>();
-}
-#endif
-
-// #ifdef CK_ENABLE_FP32
-// TEST_F(TestBatchedGemm, fp32)
-// {
-//     this->params.push_back({64, 64, 64, 2});
-//     this->params.push_back({64, 64, 64, 1});
-//     this->params.push_back({60, 60, 60, 2});
-//     this->params.push_back({68, 68, 68, 2});
-//     this->params.push_back({40, 40, 40, 2});
-//     this->params.push_back({256, 256, 128, 3});
-//     this->template Run<float>();
-// }
-// #endif
-
-int main(int argc, char** argv)
-{
-    testing::InitGoogleTest(&argc, argv);
-    if(argc == 1) {}
-    else if(argc == 3)
-    {
-        param_mask     = strtol(argv[1], nullptr, 0);
-        instance_index = atoi(argv[2]);
-    }
-    else
-    {
-        std::cout << "Usage of " << argv[0] << std::endl;
-        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
-    }
-    return RUN_ALL_TESTS();
-}
diff --git a/test/batched_gemm_gemm/CMakeLists.txt b/test/batched_gemm_gemm/CMakeLists.txt
index a12d5c3435..a66b011b33 100644
--- a/test/batched_gemm_gemm/CMakeLists.txt
+++ b/test/batched_gemm_gemm/CMakeLists.txt
@@ -1,17 +1,17 @@
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
-add_gtest_executable(test_batched_gemm_gemm_fp16_xdl test_batched_gemm_gemm_fp16_xdl.cpp)
-if(result EQUAL 0)
-  target_link_libraries(test_batched_gemm_gemm_fp16_xdl PRIVATE utility device_batched_gemm_gemm_instance)
+# NOTE: We test for XDL/WMMA support here instead of relying on the usual pattern matching in the parent CMakeLists. This is necessary
+# as these tests are universal and don't have "xdl" or "wmma" in their names to signify their target arch, but they will fail to link
+# the instance library if there are no instances present for the current arch.
+if (CK_USE_XDL OR CK_USE_WMMA) 
+  add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
+  if(result EQUAL 0)
+    target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
+  endif()
 endif()
 
-add_gtest_executable(test_batched_gemm_gemm_bf16_wmma test_batched_gemm_gemm_bf16_wmma_cshuffle_v3.cpp)
+add_gtest_executable(test_batched_gemm_gemm_bf16_wmma_cshuffle_v3 test_batched_gemm_gemm_bf16_wmma_cshuffle_v3.cpp)
 if(result EQUAL 0)
-  target_link_libraries(test_batched_gemm_gemm_bf16_wmma PRIVATE utility device_batched_gemm_gemm_instance)
-endif()
-
-add_gtest_executable(test_batched_gemm_gemm_fp16_wmma test_batched_gemm_gemm_fp16_wmma_cshuffle_v3.cpp)
-if(result EQUAL 0)
-  target_link_libraries(test_batched_gemm_gemm_fp16_wmma PRIVATE utility device_batched_gemm_gemm_instance)
+  target_link_libraries(test_batched_gemm_gemm_bf16_wmma_cshuffle_v3 PRIVATE utility device_batched_gemm_gemm_instance)
 endif()
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_xdl.cpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
similarity index 94%
rename from test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_xdl.cpp
rename to test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
index 011e53a99a..8d6405e618 100644
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_xdl.cpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
@@ -136,13 +136,20 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestBatchedGemmGemmFP16, KernelTypes);
 
-TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16) { this->Run(); }
+TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16)
+{
+    this->bench_  = false;
+    this->verify_ = true;
+    this->Run();
+}
 
 TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadM)
 {
     this->lengths_ = std::vector<std::vector<int>>{
         {136, 128, 32, 128, 1},
     };
+    this->bench_  = false;
+    this->verify_ = true;
     this->Run();
 }
 
@@ -151,6 +158,8 @@ TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadN)
     this->lengths_ = std::vector<std::vector<int>>{
         {128, 136, 32, 128, 1},
     };
+    this->bench_  = false;
+    this->verify_ = true;
     this->Run();
 }
 
@@ -160,6 +169,8 @@ TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadK)
         {128, 128, 40, 128, 1},
         {128, 128, 136, 128, 1},
     };
+    this->bench_  = false;
+    this->verify_ = true;
     this->Run();
 }
 
@@ -168,6 +179,8 @@ TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadO)
     this->lengths_ = std::vector<std::vector<int>>{
         {128, 128, 32, 136, 1},
     };
+    this->bench_  = false;
+    this->verify_ = true;
     this->Run();
 }
 
@@ -176,6 +189,8 @@ TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddM)
     this->lengths_ = std::vector<std::vector<int>>{
         {129, 128, 32, 128, 1},
     };
+    this->bench_  = false;
+    this->verify_ = true;
     this->Run();
 }
 
@@ -184,6 +199,8 @@ TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddN)
     this->lengths_ = std::vector<std::vector<int>>{
         {128, 129, 32, 128, 1},
     };
+    this->bench_  = false;
+    this->verify_ = true;
     this->Run();
 }
 
@@ -193,6 +210,8 @@ TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddK)
         {128, 128, 33, 128, 1},
         {128, 128, 129, 128, 1},
     };
+    this->bench_  = false;
+    this->verify_ = true;
     this->Run();
 }
 
@@ -202,6 +221,8 @@ TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddO)
     this->lengths_ = std::vector<std::vector<int>>{
         {128, 128, 32, 129, 1},
     };
+    this->bench_  = false;
+    this->verify_ = true;
     this->Run();
 }
 
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_wmma_cshuffle_v3.cpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_wmma_cshuffle_v3.cpp
deleted file mode 100644
index da97a95f4e..0000000000
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16_wmma_cshuffle_v3.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "gtest/gtest.h"
-#include "test_batched_gemm_gemm_util.hpp"
-
-template <typename Tuple>
-class TestBatchedGemmGemmFP16 : public TestBatchedGemmGemm<Tuple>
-{
-};
-
-// clang-format off
-using KernelTypes = ::testing::Types<
-    std::tuple<F16, F16, F16, F16, Row, Col, Row, Row>,
-    std::tuple<F16, F16, F16, F16, Row, Col, Col, Row>
-    >;
-// clang-format on
-
-TYPED_TEST_SUITE(TestBatchedGemmGemmFP16, KernelTypes);
-
-TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16)
-{
-    this->bench_  = true;
-    this->verify_ = true;
-    this->Run();
-}
-
-TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadM)
-{
-    this->lengths_ = std::vector<std::vector<int>>{
-        {136, 128, 32, 128, 1},
-    };
-    this->bench_  = true;
-    this->verify_ = true;
-    this->Run();
-}
-
-TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadN)
-{
-    this->lengths_ = std::vector<std::vector<int>>{
-        {128, 136, 32, 128, 1},
-    };
-    this->bench_  = true;
-    this->verify_ = true;
-    this->Run();
-}
-
-TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadK)
-{
-    this->lengths_ = std::vector<std::vector<int>>{
-        {128, 128, 40, 128, 1},
-        {128, 128, 136, 128, 1},
-    };
-    this->bench_  = true;
-    this->verify_ = true;
-    this->Run();
-}
-
-TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadO)
-{
-    this->lengths_ = std::vector<std::vector<int>>{
-        {128, 128, 32, 136, 1},
-    };
-    this->bench_  = true;
-    this->verify_ = true;
-    this->Run();
-}
-
-TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddM)
-{
-    this->lengths_ = std::vector<std::vector<int>>{
-        {129, 128, 32, 128, 1},
-    };
-    this->bench_  = true;
-    this->verify_ = true;
-    this->Run();
-}
-
-TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddN)
-{
-    this->lengths_ = std::vector<std::vector<int>>{
-        {128, 129, 32, 128, 1},
-    };
-    this->bench_  = true;
-    this->verify_ = true;
-    this->Run();
-}
-
-TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddK)
-{
-    this->lengths_ = std::vector<std::vector<int>>{
-        {128, 128, 33, 128, 1},
-        {128, 128, 129, 128, 1},
-    };
-    this->bench_  = true;
-    this->verify_ = true;
-    this->Run();
-}
-
-// If kernel B1Layout is RowMajor, expect not to support odd O size
-TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddO)
-{
-    this->lengths_ = std::vector<std::vector<int>>{
-        {128, 128, 32, 129, 1},
-    };
-    this->bench_  = true;
-    this->verify_ = true;
-    this->Run();
-}
-
-TYPED_TEST(TestBatchedGemmGemmFP16, DISABLED_Bench_FP16)
-{
-    this->lengths_ = std::vector<std::vector<int>>{
-        {256, 256, 64, 64, 768},
-        {256, 256, 128, 128, 768},
-        {512, 512, 64, 64, 768},
-        {512, 512, 128, 128, 768},
-        {1024, 1024, 64, 64, 768},
-        {1024, 1024, 128, 128, 768},
-        {2048, 2048, 64, 64, 768},
-        {2048, 2048, 128, 128, 768},
-        {4096, 4096, 64, 64, 768},
-        {4096, 4096, 128, 128, 768},
-    };
-    this->bench_  = true;
-    this->verify_ = false;
-    this->Run();
-}
diff --git a/test/gemm_add/CMakeLists.txt b/test/gemm_add/CMakeLists.txt
index 17bfadf95d..4cfd6abdc1 100644
--- a/test/gemm_add/CMakeLists.txt
+++ b/test/gemm_add/CMakeLists.txt
@@ -3,29 +3,29 @@
 
 # Implements test instances for MultipleD with xdl and wmma support.
 
-add_gtest_executable(test_gemm_add_xdl test_gemm_add_xdl.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_xdl PRIVATE utility device_gemm_add_instance)
-endif()
+# NOTE: We test for XDL/WMMA support here instead of relying on the usual pattern matching in the parent CMakeLists. This is necessary
+# as these tests are universal and don't have "xdl" or "wmma" in their names to signify their target arch, but they will fail to link
+# the instance library if there are no instances present for the current arch.
+if (CK_USE_XDL OR CK_USE_WMMA) 
+    add_gtest_executable(test_gemm_add test_gemm_add.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_gemm_add PRIVATE utility device_gemm_add_instance)
+    endif()
 
-add_gtest_executable(test_gemm_add_relu_xdl test_gemm_add_relu_xdl.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_relu_xdl PRIVATE utility device_gemm_add_instance device_gemm_add_relu_instance)
-endif()
+    add_gtest_executable(test_gemm_add_relu test_gemm_add_relu.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_gemm_add_relu PRIVATE utility device_gemm_add_instance device_gemm_add_relu_instance)
+    endif()
 
-add_gtest_executable(test_gemm_add_silu_xdl test_gemm_add_silu_xdl.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_silu_xdl PRIVATE utility device_gemm_add_instance device_gemm_add_silu_instance)
-endif()
+    add_gtest_executable(test_gemm_add_silu test_gemm_add_silu.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_gemm_add_silu PRIVATE utility device_gemm_add_instance device_gemm_add_silu_instance)
+    endif()
 
-add_gtest_executable(test_gemm_add_silu_wmma test_gemm_add_silu_wmma.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_silu_wmma PRIVATE utility device_gemm_add_instance device_gemm_add_silu_instance)
-endif()
-
-add_gtest_executable(test_gemm_add_fastgelu_xdl test_gemm_add_fastgelu_xdl.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_fastgelu_xdl PRIVATE utility device_gemm_add_instance device_gemm_add_fastgelu_instance)
+    add_gtest_executable(test_gemm_add_fastgelu test_gemm_add_fastgelu.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_gemm_add_fastgelu PRIVATE utility device_gemm_add_fastgelu_instance)
+    endif()
 endif()
 
 add_gtest_executable(test_gemm_fastgelu_wmma test_gemm_fastgelu_wmma.cpp)
@@ -33,16 +33,6 @@ if(result EQUAL 0)
     target_link_libraries(test_gemm_fastgelu_wmma PRIVATE utility device_gemm_fastgelu_instance)
 endif()
 
-add_gtest_executable(test_gemm_add_fastgelu_wmma test_gemm_add_fastgelu_wmma.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_fastgelu_wmma PRIVATE utility device_gemm_add_fastgelu_instance)
-endif()
-
-add_gtest_executable(test_gemm_add_wmma test_gemm_add_wmma.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_wmma PRIVATE utility device_gemm_add_instance)
-endif()
-
 add_gtest_executable(test_gemm_add_add_fastgelu_wmma test_gemm_add_add_fastgelu_wmma.cpp)
 if(result EQUAL 0)
     target_link_libraries(test_gemm_add_add_fastgelu_wmma PRIVATE utility device_gemm_add_add_fastgelu_instance)
@@ -66,9 +56,4 @@ endif()
 add_gtest_executable(test_gemm_bilinear_wmma test_gemm_bilinear_wmma.cpp)
 if(result EQUAL 0)
     target_link_libraries(test_gemm_bilinear_wmma PRIVATE utility device_gemm_bilinear_instance)
-endif()
-
-add_gtest_executable(test_gemm_add_relu_wmma test_gemm_add_relu_wmma.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_add_relu_wmma PRIVATE utility device_gemm_add_relu_instance)
 endif()
\ No newline at end of file
diff --git a/test/gemm_add/test_gemm_add_wmma.cpp b/test/gemm_add/test_gemm_add.cpp
similarity index 83%
rename from test/gemm_add/test_gemm_add_wmma.cpp
rename to test/gemm_add/test_gemm_add.cpp
index bc440a9ae8..61f4372c76 100644
--- a/test/gemm_add/test_gemm_add_wmma.cpp
+++ b/test/gemm_add/test_gemm_add.cpp
@@ -26,7 +26,9 @@ class TestGemmAdd : public TestGemmD0Common<Tuple>
 };
 
 using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
-                                     std::tuple<BF16, BF16, F32, BF16, BF16, Row, Row, Row, Row>>;
+                                     std::tuple<BF16, BF16, F32, BF16, BF16, Row, Row, Row, Row>,
+                                     std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
+                                     std::tuple<BF16, I8, F32, BF16, BF16, Row, Row, Row, Row>>;
 
 TYPED_TEST_SUITE(TestGemmAdd, KernelTypes);
-TYPED_TEST(TestGemmAdd, Test_BF16FP16) { this->Run(); }
+TYPED_TEST(TestGemmAdd, Test_BF16FP16_FP16FP16_INT8) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_fastgelu_wmma.cpp b/test/gemm_add/test_gemm_add_fastgelu.cpp
similarity index 80%
rename from test/gemm_add/test_gemm_add_fastgelu_wmma.cpp
rename to test/gemm_add/test_gemm_add_fastgelu.cpp
index e72b1c3761..2e3fe24e3c 100644
--- a/test/gemm_add/test_gemm_add_fastgelu_wmma.cpp
+++ b/test/gemm_add/test_gemm_add_fastgelu.cpp
@@ -26,10 +26,12 @@ class TestGemmAddFastgelu : public TestGemmD0Common<Tuple>
     }
 };
 
-using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
+using KernelTypes = ::testing::Types<std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
+                                     std::tuple<BF16, I8, F32, BF16, BF16, Row, Row, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
                                      std::tuple<F16, F16, F32, F16, F16, Row, Col, Row, Row>,
                                      std::tuple<F16, F16, F32, F16, F16, Col, Row, Row, Row>,
                                      std::tuple<F16, F16, F32, F16, F16, Col, Col, Row, Row>>;
 
 TYPED_TEST_SUITE(TestGemmAddFastgelu, KernelTypes);
-TYPED_TEST(TestGemmAddFastgelu, Test_FP16FP16) { this->Run(); }
+TYPED_TEST(TestGemmAddFastgelu, Test_BF16FP16_FP16FP16_INT8) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_fastgelu_xdl.cpp b/test/gemm_add/test_gemm_add_fastgelu_xdl.cpp
deleted file mode 100644
index 21c5b47f88..0000000000
--- a/test/gemm_add/test_gemm_add_fastgelu_xdl.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "gtest/gtest.h"
-#include "ck/ck.hpp"
-#include "profiler/profile_gemm_add_fastgelu_impl.hpp"
-#include "test_gemm_common.hpp"
-
-template <typename Tuple>
-class TestGemmAddFastgelu : public TestGemmD0Common<Tuple>
-{
-    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
-
-    ProfileCall GetImpl() override
-    {
-        return ck::profiler::profile_gemm_add_fastgelu_impl<
-            typename TestGemmD0Common<Tuple>::ADataType,
-            typename TestGemmD0Common<Tuple>::BDataType,
-            typename TestGemmD0Common<Tuple>::AccDataType,
-            typename TestGemmD0Common<Tuple>::D0DataType,
-            typename TestGemmD0Common<Tuple>::EDataType,
-            typename TestGemmD0Common<Tuple>::ALayout,
-            typename TestGemmD0Common<Tuple>::BLayout,
-            typename TestGemmD0Common<Tuple>::D0Layout,
-            typename TestGemmD0Common<Tuple>::ELayout>;
-    }
-};
-
-using KernelTypes = ::testing::Types<std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
-                                     std::tuple<BF16, I8, F32, BF16, BF16, Row, Row, Row, Row>>;
-
-TYPED_TEST_SUITE(TestGemmAddFastgelu, KernelTypes);
-TYPED_TEST(TestGemmAddFastgelu, Test_BF16FP16) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_relu_xdl.cpp b/test/gemm_add/test_gemm_add_relu.cpp
similarity index 80%
rename from test/gemm_add/test_gemm_add_relu_xdl.cpp
rename to test/gemm_add/test_gemm_add_relu.cpp
index d87ac74188..649017ddcb 100644
--- a/test/gemm_add/test_gemm_add_relu_xdl.cpp
+++ b/test/gemm_add/test_gemm_add_relu.cpp
@@ -27,7 +27,9 @@ class TestGemmAddRelu : public TestGemmD0Common<Tuple>
 };
 
 using KernelTypes = ::testing::Types<std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
-                                     std::tuple<BF16, I8, F32, BF16, BF16, Row, Row, Row, Row>>;
+                                     std::tuple<BF16, I8, F32, BF16, BF16, Row, Row, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
+                                     std::tuple<BF16, BF16, F32, BF16, BF16, Row, Row, Row, Row>>;
 
 TYPED_TEST_SUITE(TestGemmAddRelu, KernelTypes);
-TYPED_TEST(TestGemmAddRelu, Test_BF16FP16_INT8) { this->Run(); }
+TYPED_TEST(TestGemmAddRelu, Test_BF16FP16_FP16FP16_INT8) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_relu_wmma.cpp b/test/gemm_add/test_gemm_add_relu_wmma.cpp
deleted file mode 100644
index 1d099f8bcf..0000000000
--- a/test/gemm_add/test_gemm_add_relu_wmma.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "gtest/gtest.h"
-#include "ck/ck.hpp"
-#include "profiler/profile_gemm_add_relu_impl.hpp"
-#include "test_gemm_common.hpp"
-
-template <typename Tuple>
-class TestGemmAddRelu : public TestGemmD0Common<Tuple>
-{
-    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
-
-    ProfileCall GetImpl() override
-    {
-        return ck::profiler::profile_gemm_add_relu_impl<
-            typename TestGemmD0Common<Tuple>::ADataType,
-            typename TestGemmD0Common<Tuple>::BDataType,
-            typename TestGemmD0Common<Tuple>::AccDataType,
-            typename TestGemmD0Common<Tuple>::D0DataType,
-            typename TestGemmD0Common<Tuple>::EDataType,
-            typename TestGemmD0Common<Tuple>::ALayout,
-            typename TestGemmD0Common<Tuple>::BLayout,
-            typename TestGemmD0Common<Tuple>::D0Layout,
-            typename TestGemmD0Common<Tuple>::ELayout>;
-    }
-};
-
-using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
-                                     std::tuple<BF16, BF16, F32, BF16, BF16, Row, Row, Row, Row>>;
-
-TYPED_TEST_SUITE(TestGemmAddRelu, KernelTypes);
-TYPED_TEST(TestGemmAddRelu, Test_BF16FP16) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_silu_xdl.cpp b/test/gemm_add/test_gemm_add_silu.cpp
similarity index 80%
rename from test/gemm_add/test_gemm_add_silu_xdl.cpp
rename to test/gemm_add/test_gemm_add_silu.cpp
index 3af279c286..64b51b8b1b 100644
--- a/test/gemm_add/test_gemm_add_silu_xdl.cpp
+++ b/test/gemm_add/test_gemm_add_silu.cpp
@@ -27,7 +27,9 @@ class TestGemmAddSilu : public TestGemmD0Common<Tuple>
 };
 
 using KernelTypes = ::testing::Types<std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
-                                     std::tuple<BF16, I8, F32, BF16, BF16, Row, Row, Row, Row>>;
+                                     std::tuple<BF16, I8, F32, BF16, BF16, Row, Row, Row, Row>,
+                                     std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
+                                     std::tuple<BF16, BF16, F32, BF16, BF16, Row, Row, Row, Row>>;
 
 TYPED_TEST_SUITE(TestGemmAddSilu, KernelTypes);
-TYPED_TEST(TestGemmAddSilu, Test_BF16FP16_INT8) { this->Run(); }
+TYPED_TEST(TestGemmAddSilu, Test_BF16FP16_BF16FP16_INT8) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_silu_wmma.cpp b/test/gemm_add/test_gemm_add_silu_wmma.cpp
deleted file mode 100644
index f68f67a36f..0000000000
--- a/test/gemm_add/test_gemm_add_silu_wmma.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "gtest/gtest.h"
-#include "ck/ck.hpp"
-#include "profiler/profile_gemm_add_silu_impl.hpp"
-#include "test_gemm_common.hpp"
-
-template <typename Tuple>
-class TestGemmAddSilu : public TestGemmD0Common<Tuple>
-{
-    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
-
-    ProfileCall GetImpl() override
-    {
-        return ck::profiler::profile_gemm_add_silu_impl<
-            typename TestGemmD0Common<Tuple>::ADataType,
-            typename TestGemmD0Common<Tuple>::BDataType,
-            typename TestGemmD0Common<Tuple>::AccDataType,
-            typename TestGemmD0Common<Tuple>::D0DataType,
-            typename TestGemmD0Common<Tuple>::EDataType,
-            typename TestGemmD0Common<Tuple>::ALayout,
-            typename TestGemmD0Common<Tuple>::BLayout,
-            typename TestGemmD0Common<Tuple>::D0Layout,
-            typename TestGemmD0Common<Tuple>::ELayout>;
-    }
-};
-
-using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>,
-                                     std::tuple<BF16, BF16, F32, BF16, BF16, Row, Row, Row, Row>>;
-
-TYPED_TEST_SUITE(TestGemmAddSilu, KernelTypes);
-TYPED_TEST(TestGemmAddSilu, Test_BF16FP16_BF16FP16) { this->Run(); }
diff --git a/test/gemm_add/test_gemm_add_xdl.cpp b/test/gemm_add/test_gemm_add_xdl.cpp
deleted file mode 100644
index 873e87edd4..0000000000
--- a/test/gemm_add/test_gemm_add_xdl.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "gtest/gtest.h"
-#include "ck/ck.hpp"
-#include "profiler/profile_gemm_add_impl.hpp"
-#include "test_gemm_common.hpp"
-
-template <typename Tuple>
-class TestGemmAdd : public TestGemmD0Common<Tuple>
-{
-    using ProfileCall = typename TestGemmD0Common<Tuple>::ProfileCall;
-
-    ProfileCall GetImpl() override
-    {
-        return ck::profiler::profile_gemm_add_impl<typename TestGemmD0Common<Tuple>::ADataType,
-                                                   typename TestGemmD0Common<Tuple>::BDataType,
-                                                   typename TestGemmD0Common<Tuple>::AccDataType,
-                                                   typename TestGemmD0Common<Tuple>::D0DataType,
-                                                   typename TestGemmD0Common<Tuple>::EDataType,
-                                                   typename TestGemmD0Common<Tuple>::ALayout,
-                                                   typename TestGemmD0Common<Tuple>::BLayout,
-                                                   typename TestGemmD0Common<Tuple>::D0Layout,
-                                                   typename TestGemmD0Common<Tuple>::ELayout>;
-    }
-};
-
-using KernelTypes = ::testing::Types<std::tuple<F16, I8, F32, F16, F16, Row, Row, Row, Row>,
-                                     std::tuple<BF16, I8, F32, BF16, BF16, Row, Row, Row, Row>>;
-
-TYPED_TEST_SUITE(TestGemmAdd, KernelTypes);
-TYPED_TEST(TestGemmAdd, Test_BF16FP16_INT8) { this->Run(); }
diff --git a/test/gemm_b_scale/CMakeLists.txt b/test/gemm_b_scale/CMakeLists.txt
index 517e2f01f6..b386ec67df 100644
--- a/test/gemm_b_scale/CMakeLists.txt
+++ b/test/gemm_b_scale/CMakeLists.txt
@@ -1,12 +1,12 @@
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
-add_gtest_executable(test_gemm_b_scale_xdl test_gemm_b_scale_xdl.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_b_scale_xdl PRIVATE utility device_gemm_b_scale_instance)
-endif()
-
-add_gtest_executable(test_gemm_b_scale_wmma test_gemm_b_scale_wmma.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_b_scale_wmma PRIVATE utility device_gemm_b_scale_instance)
+# NOTE: We test for XDL/WMMA support here instead of relying on the usual pattern matching in the parent CMakeLists. This is necessary
+# because these tests are universal and don't have "xdl" or "wmma" in their names to signify their target arch. However, they will fail
+# to link the instance library if there are no instances present for the current arch.
+if (CK_USE_XDL OR CK_USE_WMMA)
+    add_gtest_executable(test_gemm_b_scale test_gemm_b_scale.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_gemm_b_scale PRIVATE utility device_gemm_b_scale_instance)
+    endif()
 endif()
diff --git a/test/gemm_b_scale/test_gemm_b_scale_wmma.cpp b/test/gemm_b_scale/test_gemm_b_scale.cpp
similarity index 100%
rename from test/gemm_b_scale/test_gemm_b_scale_wmma.cpp
rename to test/gemm_b_scale/test_gemm_b_scale.cpp
diff --git a/test/gemm_b_scale/test_gemm_b_scale_xdl.cpp b/test/gemm_b_scale/test_gemm_b_scale_xdl.cpp
deleted file mode 100644
index 93eb128bb0..0000000000
--- a/test/gemm_b_scale/test_gemm_b_scale_xdl.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include <tuple>
-
-#include "gtest/gtest.h"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "test_gemm_b_scale_util.hpp"
-
-using I4  = ck::pk_i4_t;
-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-namespace {
-
-template <typename X, typename Y>
-struct tuple_concat;
-
-template <typename... Xs, typename... Ys>
-struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
-{
-    using type = std::tuple<Xs..., Ys...>;
-};
-
-} // namespace
-
-template <typename Tuple>
-class TestGemmBScale_MK_NK
-    : public ck::test::TestGemmBScale<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
-{
-};
-
-// clang-format off
-using KernelTypes_MK_NK = ::testing::Types<
-    //         ADataType, BDataType, BScaleDataType, ComputeDataType, CDataType
-    std::tuple<      F16,        I4,            F16,             F16,       F16>
-    >;
-// clang-format on
-
-TYPED_TEST_SUITE(TestGemmBScale_MK_NK, KernelTypes_MK_NK);
-
-#include "test_gemm_b_scale_ut_cases.inc"
diff --git a/test/gemm_multi_abd/CMakeLists.txt b/test/gemm_multi_abd/CMakeLists.txt
index 9b1454ca93..2e327454f2 100644
--- a/test/gemm_multi_abd/CMakeLists.txt
+++ b/test/gemm_multi_abd/CMakeLists.txt
@@ -1,12 +1,12 @@
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
-add_gtest_executable(test_gemm_multi_abd_wmma test_gemm_multi_abd_wmma.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_multi_abd_wmma PRIVATE utility device_gemm_multi_abd_instance)
-endif()
-
-add_gtest_executable(test_gemm_multi_abd_xdl test_gemm_multi_abd_xdl.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_multi_abd_xdl PRIVATE utility device_gemm_multi_abd_instance)
-endif()
+# NOTE: We test for XDL/WMMA support here instead of relying on the usual pattern matching in the parent CMakeLists. This is necessary
+# because these tests are universal and don't have "xdl" or "wmma" in their names to signify their target arch. However, they will fail
+# to link the instance library if there are no instances present for the current arch.
+if (CK_USE_XDL OR CK_USE_WMMA) 
+    add_gtest_executable(test_gemm_multi_abd test_gemm_multi_abd.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_gemm_multi_abd PRIVATE utility device_gemm_multi_abd_instance)
+    endif()
+endif()
\ No newline at end of file
diff --git a/test/gemm_multi_abd/test_gemm_multi_abd_wmma.cpp b/test/gemm_multi_abd/test_gemm_multi_abd.cpp
similarity index 100%
rename from test/gemm_multi_abd/test_gemm_multi_abd_wmma.cpp
rename to test/gemm_multi_abd/test_gemm_multi_abd.cpp
diff --git a/test/gemm_multi_abd/test_gemm_multi_abd_xdl.cpp b/test/gemm_multi_abd/test_gemm_multi_abd_xdl.cpp
deleted file mode 100644
index ed3fbbf087..0000000000
--- a/test/gemm_multi_abd/test_gemm_multi_abd_xdl.cpp
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include <tuple>
-
-#include "gtest/gtest.h"
-#include "ck/ck.hpp"
-#include "profiler/profile_gemm_multi_abd_impl.hpp"
-#include "test_gemm_common.hpp"
-
-namespace ck {
-namespace test {
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-using I8   = int8_t;
-using BF16 = ck::bhalf_t;
-
-using PassThrough         = ck::tensor_operation::element_wise::PassThrough;
-using Multiply            = ck::tensor_operation::element_wise::Multiply;
-using Add                 = ck::tensor_operation::element_wise::Add;
-using MultiplyAdd         = ck::tensor_operation::element_wise::MultiplyAdd;
-using FastGelu            = ck::tensor_operation::element_wise::FastGelu;
-using AddFastGelu         = ck::tensor_operation::element_wise::AddFastGelu;
-using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
-using MultiplyFastGelu    = ck::tensor_operation::element_wise::MultiplyFastGelu;
-
-using KernelTypesABD = ::testing::Types<std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Row, Row>,
-                                                   ck::Tuple<Row>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8, BF16>,
-                                                   ck::Tuple<BF16>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   Multiply,
-                                                   Add>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Col, Col>,
-                                                   ck::Tuple<Row>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8, BF16>,
-                                                   ck::Tuple<BF16>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   Multiply,
-                                                   Add>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Row, Row>,
-                                                   ck::Tuple<Row>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8, BF16>,
-                                                   ck::Tuple<BF16>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   Multiply,
-                                                   AddFastGelu>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Col, Col>,
-                                                   ck::Tuple<Row>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8, BF16>,
-                                                   ck::Tuple<BF16>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   Multiply,
-                                                   AddFastGelu>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Row, Row>,
-                                                   ck::Tuple<>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8, BF16>,
-                                                   ck::Tuple<>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   Multiply,
-                                                   FastGelu>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Col, Col>,
-                                                   ck::Tuple<>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8, BF16>,
-                                                   ck::Tuple<>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   Multiply,
-                                                   FastGelu>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Row, Row>,
-                                                   ck::Tuple<>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8, BF16>,
-                                                   ck::Tuple<>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   Multiply,
-                                                   PassThrough>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Col, Col>,
-                                                   ck::Tuple<>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8, BF16>,
-                                                   ck::Tuple<>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   Multiply,
-                                                   PassThrough>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Row>,
-                                                   ck::Tuple<Row, Row>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8>,
-                                                   ck::Tuple<BF16, BF16>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   PassThrough,
-                                                   MultiplyAddFastGelu>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Row>,
-                                                   ck::Tuple<Row, Row>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8>,
-                                                   ck::Tuple<BF16, BF16>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   PassThrough,
-                                                   MultiplyAdd>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Row>,
-                                                   ck::Tuple<Row>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8>,
-                                                   ck::Tuple<BF16>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   PassThrough,
-                                                   MultiplyFastGelu>,
-                                        std::tuple<ck::Tuple<Row>,
-                                                   ck::Tuple<Row>,
-                                                   ck::Tuple<Row>,
-                                                   ck::Tuple<BF16>,
-                                                   ck::Tuple<I8>,
-                                                   ck::Tuple<BF16>,
-                                                   BF16,
-                                                   PassThrough,
-                                                   PassThrough,
-                                                   Multiply>>;
-
-TYPED_TEST_SUITE(TestGemmCommon, KernelTypesABD);
-TYPED_TEST(TestGemmCommon, Test_BF16I8BF16) { this->Run(); }
-
-} // namespace test
-} // namespace ck
diff --git a/test/gemm_universal/CMakeLists.txt b/test/gemm_universal/CMakeLists.txt
index 5be42aae90..d48343797a 100644
--- a/test/gemm_universal/CMakeLists.txt
+++ b/test/gemm_universal/CMakeLists.txt
@@ -1,32 +1,23 @@
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
-add_gtest_executable(test_gemm_universal_wmma_fp16 test_gemm_universal_wmma_fp16.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_universal_wmma_fp16 PRIVATE utility device_gemm_universal_instance)
-endif()
 
-add_gtest_executable(test_gemm_universal_wmma_bf16 test_gemm_universal_wmma_bf16.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_universal_wmma_bf16 PRIVATE utility device_gemm_universal_instance)
-endif()
+# NOTE: We test for XDL/WMMA support here instead of relying on the usual pattern matching in the parent CMakeLists. This is necessary
+# because these tests are universal and don't have "xdl" or "wmma" in their names to signify their target arch. However, they will fail
+# to link the instance library if there are no instances present for the current arch.
+if (CK_USE_XDL OR CK_USE_WMMA) 
+    add_gtest_executable(test_gemm_universal_fp16 test_gemm_universal_fp16.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_gemm_universal_fp16 PRIVATE utility device_gemm_universal_instance)
+    endif()
 
-add_gtest_executable(test_gemm_universal_wmma_fp8 test_gemm_universal_wmma_fp8.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_universal_wmma_fp8 PRIVATE utility device_gemm_universal_instance)
-endif()
+    add_gtest_executable(test_gemm_universal_fp8 test_gemm_universal_fp8.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_gemm_universal_fp8 PRIVATE utility device_gemm_universal_instance)
+    endif()
 
-add_gtest_executable(test_gemm_universal_xdl_fp16 test_gemm_universal_xdl_fp16.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_universal_xdl_fp16 PRIVATE utility device_gemm_universal_instance)
-endif()
-
-add_gtest_executable(test_gemm_universal_xdl_fp8 test_gemm_universal_xdl_fp8.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_universal_xdl_fp8 PRIVATE utility device_gemm_universal_instance)
-endif()
-
-add_gtest_executable(test_gemm_universal_xdl_bf16 test_gemm_universal_xdl_bf16.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_gemm_universal_xdl_bf16 PRIVATE utility device_gemm_universal_instance)
+    add_gtest_executable(test_gemm_universal_bf16 test_gemm_universal_bf16.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_gemm_universal_bf16 PRIVATE utility device_gemm_universal_instance)
+    endif()
 endif()
diff --git a/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp b/test/gemm_universal/test_gemm_universal_bf16.cpp
similarity index 95%
rename from test/gemm_universal/test_gemm_universal_wmma_bf16.cpp
rename to test/gemm_universal/test_gemm_universal_bf16.cpp
index e9f25df162..a4306e6916 100644
--- a/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp
+++ b/test/gemm_universal/test_gemm_universal_bf16.cpp
@@ -55,7 +55,8 @@ class TestGemmUniversal_BF16_KM_NK
 // clang-format off
 using KernelTypes_MK_KN = ::testing::Types<
     //         ADataType, BDataType, ComputeDataType, CDataType
-    std::tuple<     BF16,      BF16,            BF16,      BF16>
+
+    std::tuple<     BF16,      BF16,            BF16,    BF16>
     >;
 
 using KernelTypes_MK_NK = ::testing::Types<
@@ -66,11 +67,6 @@ using KernelTypes_MK_NK = ::testing::Types<
     std::tuple<     BF16,      BF16,            BF16,      BF16>
     >;
 
-using KernelTypes_KM_KN = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-    std::tuple<     BF16,      BF16,            BF16,      BF16>
-    >;
-
 using KernelTypes_KM_NK = ::testing::Types<
     //         ADataType, BDataType, ComputeDataType, CDataType
 #if defined(CK_ENABLE_FP8)
@@ -78,6 +74,12 @@ using KernelTypes_KM_NK = ::testing::Types<
 #endif
     std::tuple<     BF16,      BF16,            BF16,      BF16>
     >;
+
+using KernelTypes_KM_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<     BF16,      BF16,            BF16,    BF16>
+    >;
+
 // clang-format on
 
 TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_KN, KernelTypes_MK_KN);
diff --git a/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp b/test/gemm_universal/test_gemm_universal_fp16.cpp
similarity index 100%
rename from test/gemm_universal/test_gemm_universal_wmma_fp16.cpp
rename to test/gemm_universal/test_gemm_universal_fp16.cpp
diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp b/test/gemm_universal/test_gemm_universal_fp8.cpp
similarity index 76%
rename from test/gemm_universal/test_gemm_universal_xdl_fp8.cpp
rename to test/gemm_universal/test_gemm_universal_fp8.cpp
index 49a0670528..636305c96a 100644
--- a/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp
+++ b/test/gemm_universal/test_gemm_universal_fp8.cpp
@@ -44,31 +44,34 @@ class TestGemmUniversal_FP8_MK_NK
 // clang-format off
 using KernelTypes_MK_KN = ::testing::Types<
     //         ADataType, BDataType, ComputeDataType, CDataType
-#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) && !defined(CK_USE_WMMA_FP8)
     std::tuple<      F16,        F8,             F16,     F16>,
-    std::tuple<       F8,       F16,             F16,     F16>,
-    std::tuple<       F8,        F8,              F8,    BF16>,
-#endif
+    std::tuple<       F8,       F16,             F16,     F16>>;
+#elif defined(CK_USE_WMMA_FP8)
+    // Test type when WMMA FP8 is used
+    std::tuple<       F8,        F8,              F8,    BF16>>;
+#else
     // Fallback test type when FP8 is not enabled
-    std::tuple<      F16,       F16,             F16,     F16>
-    >;
+    std::tuple<      F16,       F16,             F16,     F16>>;
+#endif
 using KernelTypes_MK_NK = ::testing::Types<
     //         ADataType, BDataType, ComputeDataType, CDataType
 
-#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) && !defined(CK_USE_WMMA_FP8)
     std::tuple<      F16,        F8,             F16,     F16>,
-    std::tuple<       F8,       F16,             F16,     F16>,
-    std::tuple<       F8,        F8,              F8,    BF16>,
-#endif
+    std::tuple<       F8,       F16,             F16,     F16>>;
+#elif defined(CK_USE_WMMA_FP8)
+    // Test type when WMMA FP8 is used
+    std::tuple<       F8,        F8,              F8,    BF16>>;
+#else
     // Fallback test type when FP8 is not enabled
-    std::tuple<      F16,       F16,             F16,     F16>
-    >;
-
+    std::tuple<      F16,       F16,             F16,     F16>>;
+#endif
+// clang-format on
 
 TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_KN, KernelTypes_MK_KN);
 TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK);
 
-
 #include "test_gemm_universal_ut_cases_fp8.inc"
 int main(int argc, char** argv)
 {
diff --git a/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp b/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp
deleted file mode 100644
index 5d54144747..0000000000
--- a/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include <tuple>
-
-#include "gtest/gtest.h"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "test_gemm_universal_util.hpp"
-ck::index_t param_mask     = 0xffff;
-ck::index_t instance_index = -1;
-#if defined(CK_USE_WMMA_FP8)
-
-using F8   = ck::f8_t;
-using BF16 = ck::bhalf_t;
-using F32  = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-namespace {
-
-template <typename X, typename Y>
-struct tuple_concat;
-
-template <typename... Xs, typename... Ys>
-struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
-{
-    using type = std::tuple<Xs..., Ys...>;
-};
-
-} // namespace
-
-template <typename Tuple>
-class TestGemmUniversal_FP8_MK_KN
-    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
-{
-};
-
-template <typename Tuple>
-class TestGemmUniversal_FP8_MK_NK
-    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
-{
-};
-
-// clang-format off
-using KernelTypes_MK_KN = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-    std::tuple<       F8,        F8,              F8,      BF16>
-    >;
-
-using KernelTypes_MK_NK = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-    std::tuple<       F8,        F8,              F8,      BF16>
-    >;
-// clang-format on
-
-TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_KN, KernelTypes_MK_KN);
-TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK);
-
-#include "test_gemm_universal_ut_cases_fp8.inc"
-
-#endif // CK_USE_WMMA_FP8
-int main(int argc, char** argv)
-{
-    testing::InitGoogleTest(&argc, argv);
-    if(argc == 1) {}
-    else if(argc == 3)
-    {
-        param_mask     = strtol(argv[1], nullptr, 0);
-        instance_index = atoi(argv[2]);
-    }
-    else
-    {
-        std::cout << "Usage of " << argv[0] << std::endl;
-        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
-    }
-    return RUN_ALL_TESTS();
-}
diff --git a/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp b/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp
deleted file mode 100644
index 18031cd762..0000000000
--- a/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include <tuple>
-
-#include "gtest/gtest.h"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "test_gemm_universal_util.hpp"
-ck::index_t param_mask     = 0xffff;
-ck::index_t instance_index = -1;
-using BF16                 = ck::bhalf_t;
-using F32                  = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-namespace {
-
-template <typename X, typename Y>
-struct tuple_concat;
-
-template <typename... Xs, typename... Ys>
-struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
-{
-    using type = std::tuple<Xs..., Ys...>;
-};
-
-} // namespace
-
-template <typename Tuple>
-class TestGemmUniversal_BF16_MK_KN
-    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
-{
-};
-
-template <typename Tuple>
-class TestGemmUniversal_BF16_MK_NK
-    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
-{
-};
-
-template <typename Tuple>
-class TestGemmUniversal_BF16_KM_KN
-    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Col, Row>, Tuple>::type>
-{
-};
-
-template <typename Tuple>
-class TestGemmUniversal_BF16_KM_NK
-    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Col, Col>, Tuple>::type>
-{
-};
-
-// clang-format off
-using KernelTypes_MK_KN = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-
-    std::tuple<     BF16,      BF16,            BF16,    BF16>
-    >;
-using KernelTypes_MK_NK = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-
-    std::tuple<     BF16,      BF16,            BF16,    BF16>
-    >;
-
-using KernelTypes_KM_NK = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-    std::tuple<     BF16,      BF16,            BF16,    BF16>
-    >;
-
-using KernelTypes_KM_KN = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-    std::tuple<     BF16,      BF16,            BF16,    BF16>
-    >;
-
-// clang-format on
-
-TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_KN, KernelTypes_MK_KN);
-TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_NK, KernelTypes_MK_NK);
-TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN);
-TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK);
-
-#include "test_gemm_universal_ut_cases_bf16.inc"
-int main(int argc, char** argv)
-{
-    testing::InitGoogleTest(&argc, argv);
-    if(argc == 1) {}
-    else if(argc == 3)
-    {
-        param_mask     = strtol(argv[1], nullptr, 0);
-        instance_index = atoi(argv[2]);
-    }
-    else
-    {
-        std::cout << "Usage of " << argv[0] << std::endl;
-        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
-    }
-    return RUN_ALL_TESTS();
-}
diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp
deleted file mode 100644
index 9e99b45e80..0000000000
--- a/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include <tuple>
-
-#include "gtest/gtest.h"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "test_gemm_universal_util.hpp"
-ck::index_t param_mask     = 0xffff;
-ck::index_t instance_index = -1;
-using F8                   = ck::f8_t;
-using F16                  = ck::half_t;
-
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-namespace {
-
-template <typename X, typename Y>
-struct tuple_concat;
-
-template <typename... Xs, typename... Ys>
-struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
-{
-    using type = std::tuple<Xs..., Ys...>;
-};
-
-} // namespace
-
-template <typename Tuple>
-class TestGemmUniversal_FP16_MK_KN
-    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
-{
-};
-
-template <typename Tuple>
-class TestGemmUniversal_FP16_MK_NK
-    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
-{
-};
-
-template <typename Tuple>
-class TestGemmUniversal_FP16_KM_KN
-    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Col, Row>, Tuple>::type>
-{
-};
-
-template <typename Tuple>
-class TestGemmUniversal_FP16_KM_NK
-    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Col, Col>, Tuple>::type>
-{
-};
-
-// clang-format off
-using KernelTypes_MK_KN = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-
-#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
-    std::tuple<      F16,        F8,             F16,     F16>,
-    std::tuple<       F8,       F16,             F16,     F16>,
-
-#endif
-    std::tuple<      F16,       F16,             F16,     F16>
-    >;
-
-using KernelTypes_MK_NK = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-
-#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
-    std::tuple<      F16,        F8,             F16,     F16>,
-    std::tuple<       F8,       F16,             F16,     F16>,
-
-#endif
-    std::tuple<      F16,       F16,             F16,     F16>
-    >;
-
-using KernelTypes_KM_NK = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-    std::tuple<      F16,       F16,             F16,     F16>
-    >;
-
-using KernelTypes_KM_KN = ::testing::Types<
-    //         ADataType, BDataType, ComputeDataType, CDataType
-    std::tuple<      F16,       F16,             F16,     F16>
-    >;
-// clang-format on
-
-TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_KN, KernelTypes_MK_KN);
-TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_NK, KernelTypes_MK_NK);
-TYPED_TEST_SUITE(TestGemmUniversal_FP16_KM_NK, KernelTypes_KM_NK);
-TYPED_TEST_SUITE(TestGemmUniversal_FP16_KM_KN, KernelTypes_KM_KN);
-
-#include "test_gemm_universal_ut_cases_fp16.inc"
-int main(int argc, char** argv)
-{
-    testing::InitGoogleTest(&argc, argv);
-    if(argc == 1) {}
-    else if(argc == 3)
-    {
-        param_mask     = strtol(argv[1], nullptr, 0);
-        instance_index = atoi(argv[2]);
-    }
-    else
-    {
-        std::cout << "Usage of " << argv[0] << std::endl;
-        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
-    }
-    return RUN_ALL_TESTS();
-}
diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/test/grouped_convnd_bwd_data/CMakeLists.txt
index 0f6285cfea..514f8e9668 100644
--- a/test/grouped_convnd_bwd_data/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_data/CMakeLists.txt
@@ -21,11 +21,10 @@ if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
     target_compile_options(test_grouped_conv_bwd_data_scale PRIVATE -Wno-global-constructors -Wno-undef)
     target_link_libraries(test_grouped_conv_bwd_data_scale PRIVATE gtest_main getopt::getopt utility device_grouped_conv3d_bwd_data_scale_instance)
 endif()
-add_gtest_executable(test_grouped_convnd_bwd_data_interface_xdl test_grouped_convnd_bwd_data_interface_xdl.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_grouped_convnd_bwd_data_interface_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance)
-endif()
-add_gtest_executable(test_grouped_convnd_bwd_data_interface_wmma test_grouped_convnd_bwd_data_interface_wmma.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_grouped_convnd_bwd_data_interface_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance)
+
+if (CK_USE_XDL OR CK_USE_WMMA) 
+    add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
+    endif()
 endif()
\ No newline at end of file
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface.cpp
similarity index 52%
rename from test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp
rename to test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface.cpp
index 969960275f..ab89d9d0f0 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface.cpp
@@ -12,6 +12,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp"
 
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/algorithm.hpp"
@@ -32,7 +33,7 @@ static constexpr auto ConvBwdDataDefault   = ConvBackwardDataSpecialization::Def
 static constexpr auto Filter1x1Stride1Pad0 = ConvBackwardDataSpecialization::Filter1x1Stride1Pad0;
 
 template <typename Tuple, ConvBackwardDataSpecialization ConvSpec>
-class TestGroupedConvndBwdData : public ::testing::Test
+class TestGroupedConvndBwdDataXdl : public ::testing::Test
 {
     protected:
     static constexpr ck::index_t NDimSpatial = 2;
@@ -119,6 +120,100 @@ class TestGroupedConvndBwdData : public ::testing::Test
     }
 };
 
+template <typename Tuple, ConvBackwardDataSpecialization ConvSpec>
+class TestGroupedConvndBwdDataWmma : public ::testing::Test
+{
+    protected:
+    static constexpr ck::index_t NDimSpatial = 2;
+
+    using OutLayout = std::tuple_element_t<0, Tuple>;
+    using WeiLayout = std::tuple_element_t<1, Tuple>;
+    using InLayout  = std::tuple_element_t<2, Tuple>;
+
+    // clang-format off
+    using GroupedConvBwdDataDeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
+            //|    NumDim|        A|         B|          Ds|       E|        AData|        BData|    AccData|          CShuffle|     DsData|       EData|           A|           B|          CDE|  ConvBackwardData| Block|  MPer|  NPer| K0Per| K1|  MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+            //|   Spatial|   Layout|    Layout|      Layout|  Layout|         Type|         Type|       Type|          DataType|       Type|        Type| Elementwise| Elementwise|  Elementwise|    Specialization|  Size| Block| Block| Block|   |  WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+            //|          |         |          |            |        |             |             |           |                  |           |            |   Operation|   Operation|    Operation|                  |      |      |      |      |   |      |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+            //|          |         |          |            |        |             |             |           |                  |           |            |            |            |             |                  |      |      |      |      |   |      |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+            < NDimSpatial,OutLayout, WeiLayout, ck::Tuple<>, InLayout,       DataType,  DataType, AccDataType,          DataType,  ck::Tuple<>,   DataType,        Pass,        Pass,        Pass,         ConvSpec, 64,    32,    64,     8,  8,    16,   16,       1,       4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,               8>;
+    // clang-format on
+
+    ck::utils::conv::ConvParam conv_param;
+
+    void SetUp() override
+    {
+        if(!ck::is_gfx11_supported())
+        {
+            GTEST_SKIP();
+        }
+    }
+
+    template <ck::index_t NDimSpatial>
+    bool Run()
+    {
+
+        const auto out_g_n_k_wos_desc =
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+                conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+                conv_param);
+
+        const auto in_g_n_c_wis_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
+                conv_param);
+
+        std::array<ck::index_t, NDimSpatial + 3> out_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> out_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> wei_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> wei_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> in_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> in_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+        std::array<ck::index_t, NDimSpatial> input_left_pads{};
+        std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+        auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+        copy(out_g_n_k_wos_desc.GetLengths(), out_lengths);
+        copy(out_g_n_k_wos_desc.GetStrides(), out_strides);
+        copy(wei_g_k_c_xs_desc.GetLengths(), wei_lengths);
+        copy(wei_g_k_c_xs_desc.GetStrides(), wei_strides);
+        copy(in_g_n_c_wis_desc.GetLengths(), in_lengths);
+        copy(in_g_n_c_wis_desc.GetStrides(), in_strides);
+        copy(conv_param.conv_filter_strides_, conv_filter_strides);
+        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+        copy(conv_param.input_left_pads_, input_left_pads);
+        copy(conv_param.input_right_pads_, input_right_pads);
+
+        auto conv = GroupedConvBwdDataDeviceInstance{};
+
+        auto argument = conv.MakeArgument(nullptr,
+                                          nullptr,
+                                          std::array<const void*, 0>{},
+                                          nullptr,
+                                          out_lengths,
+                                          out_strides,
+                                          wei_lengths,
+                                          wei_strides,
+                                          {},
+                                          {},
+                                          in_lengths,
+                                          in_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads,
+                                          Pass{},
+                                          Pass{},
+                                          Pass{});
+        return conv.IsSupportedArgument(argument);
+    }
+};
+
 using GNHWC = ck::tensor_layout::convolution::GNHWC;
 using NHWGC = ck::tensor_layout::convolution::NHWGC;
 
@@ -131,20 +226,35 @@ using KernelTypes =
     ::testing::Types<std::tuple<GNHWK, GKYXC, GNHWC>, std::tuple<NHWGK, GKYXC, NHWGC>>;
 
 template <typename Tuple>
-class TestGroupedConvndBwdDataDefault : public TestGroupedConvndBwdData<Tuple, ConvBwdDataDefault>
+class TestGroupedConvndBwdDataDefaultXdl
+    : public TestGroupedConvndBwdDataXdl<Tuple, ConvBwdDataDefault>
 {
 };
 
 template <typename Tuple>
-class TestGroupedConvndBwdDataFilter1x1
-    : public TestGroupedConvndBwdData<Tuple, Filter1x1Stride1Pad0>
+class TestGroupedConvndBwdDataFilter1x1Xdl
+    : public TestGroupedConvndBwdDataXdl<Tuple, Filter1x1Stride1Pad0>
 {
 };
 
-TYPED_TEST_SUITE(TestGroupedConvndBwdDataDefault, KernelTypes);
-TYPED_TEST_SUITE(TestGroupedConvndBwdDataFilter1x1, KernelTypes);
+template <typename Tuple>
+class TestGroupedConvndBwdDataDefaultWmma
+    : public TestGroupedConvndBwdDataWmma<Tuple, ConvBwdDataDefault>
+{
+};
 
-TYPED_TEST(TestGroupedConvndBwdDataFilter1x1, SpecializationCheck)
+template <typename Tuple>
+class TestGroupedConvndBwdDataFilter1x1Wmma
+    : public TestGroupedConvndBwdDataWmma<Tuple, Filter1x1Stride1Pad0>
+{
+};
+
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataDefaultXdl, KernelTypes);
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataFilter1x1Xdl, KernelTypes);
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataDefaultWmma, KernelTypes);
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataFilter1x1Wmma, KernelTypes);
+
+TYPED_TEST(TestGroupedConvndBwdDataFilter1x1Xdl, SpecializationCheckXdl)
 {
     // Check filter 3,3 instead of 1,1
     this->conv_param  = {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
@@ -167,7 +277,30 @@ TYPED_TEST(TestGroupedConvndBwdDataFilter1x1, SpecializationCheck)
     EXPECT_TRUE(is_supported);
 }
 
-TYPED_TEST(TestGroupedConvndBwdDataDefault, VectorLoadCheck)
+TYPED_TEST(TestGroupedConvndBwdDataFilter1x1Wmma, SpecializationCheckWmma)
+{
+    // Check filter 3,3 instead of 1,1
+    this->conv_param  = {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+    bool is_supported = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+
+    // Check strides 2,2 instead of 1,1
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    is_supported     = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+
+    // Check with pad
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}};
+    is_supported     = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+
+    // Supported version
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+    is_supported     = this->template Run<2>();
+    EXPECT_TRUE(is_supported);
+}
+
+TYPED_TEST(TestGroupedConvndBwdDataDefaultXdl, VectorLoadCheckXdl)
 {
     // vector load for A
     this->conv_param  = {2, 2, 128, 129, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
@@ -179,7 +312,19 @@ TYPED_TEST(TestGroupedConvndBwdDataDefault, VectorLoadCheck)
     EXPECT_FALSE(is_supported);
 }
 
-TYPED_TEST(TestGroupedConvndBwdDataDefault, SplitK)
+TYPED_TEST(TestGroupedConvndBwdDataDefaultWmma, VectorLoadCheckWmma)
+{
+    // vector load for A
+    this->conv_param  = {2, 2, 128, 129, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    bool is_supported = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+    // vector load for B, E, Ds
+    this->conv_param = {2, 2, 128, 128, 257, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    is_supported     = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+}
+
+TYPED_TEST(TestGroupedConvndBwdDataDefaultXdl, SplitK)
 {
     if(ck::is_xdl_supported())
     {
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp
deleted file mode 100644
index 871c41e706..0000000000
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include <cstdlib>
-#include <iostream>
-#include <initializer_list>
-#include <tuple>
-#include <vector>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp"
-
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/algorithm.hpp"
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using DataType    = ck::half_t;
-using AccDataType = float;
-using Pass        = ck::tensor_operation::element_wise::PassThrough;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using ConvBackwardDataSpecialization =
-    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization;
-
-static constexpr auto ConvBwdDataDefault   = ConvBackwardDataSpecialization::Default;
-static constexpr auto Filter1x1Stride1Pad0 = ConvBackwardDataSpecialization::Filter1x1Stride1Pad0;
-
-template <typename Tuple, ConvBackwardDataSpecialization ConvSpec>
-class TestGroupedConvndBwdData : public ::testing::Test
-{
-    protected:
-    static constexpr ck::index_t NDimSpatial = 2;
-
-    using OutLayout = std::tuple_element_t<0, Tuple>;
-    using WeiLayout = std::tuple_element_t<1, Tuple>;
-    using InLayout  = std::tuple_element_t<2, Tuple>;
-
-    // clang-format off
-    using GroupedConvBwdDataDeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
-            //|    NumDim|        A|         B|          Ds|       E|        AData|        BData|    AccData|          CShuffle|     DsData|       EData|           A|           B|          CDE|       ConvForward| Block|  MPer|  NPer| K0Per| K1|  MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-            //|   Spatial|   Layout|    Layout|      Layout|  Layout|         Type|         Type|       Type|          DataType|       Type|        Type| Elementwise| Elementwise|  Elementwise|    Specialization|  Size| Block| Block| Block|   |  WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //|          |         |          |            |        |             |             |           |                  |           |            |   Operation|   Operation|    Operation|                  |      |      |      |      |   |      |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-            //|          |         |          |            |        |             |             |           |                  |           |            |            |            |             |                  |      |      |      |      |   |      |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-            < NDimSpatial,OutLayout, WeiLayout, ck::Tuple<>, InLayout,       DataType,  DataType, AccDataType,          DataType,  ck::Tuple<>,   DataType,        Pass,        Pass,        Pass,         ConvSpec, 64,    32,    64,     8,  8,    16,   16,       1,       4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,               8>;
-    // clang-format on
-
-    ck::utils::conv::ConvParam conv_param;
-
-    void SetUp() override
-    {
-        if(!ck::is_gfx11_supported())
-        {
-            GTEST_SKIP();
-        }
-    }
-
-    template <ck::index_t NDimSpatial>
-    bool Run()
-    {
-
-        const auto out_g_n_k_wos_desc =
-            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
-                conv_param);
-
-        const auto wei_g_k_c_xs_desc =
-            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
-                conv_param);
-
-        const auto in_g_n_c_wis_desc =
-            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
-                conv_param);
-
-        std::array<ck::index_t, NDimSpatial + 3> out_lengths{};
-        std::array<ck::index_t, NDimSpatial + 3> out_strides{};
-        std::array<ck::index_t, NDimSpatial + 3> wei_lengths{};
-        std::array<ck::index_t, NDimSpatial + 3> wei_strides{};
-        std::array<ck::index_t, NDimSpatial + 3> in_lengths{};
-        std::array<ck::index_t, NDimSpatial + 3> in_strides{};
-        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
-        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
-        std::array<ck::index_t, NDimSpatial> input_left_pads{};
-        std::array<ck::index_t, NDimSpatial> input_right_pads{};
-
-        auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
-
-        copy(out_g_n_k_wos_desc.GetLengths(), out_lengths);
-        copy(out_g_n_k_wos_desc.GetStrides(), out_strides);
-        copy(wei_g_k_c_xs_desc.GetLengths(), wei_lengths);
-        copy(wei_g_k_c_xs_desc.GetStrides(), wei_strides);
-        copy(in_g_n_c_wis_desc.GetLengths(), in_lengths);
-        copy(in_g_n_c_wis_desc.GetStrides(), in_strides);
-        copy(conv_param.conv_filter_strides_, conv_filter_strides);
-        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
-        copy(conv_param.input_left_pads_, input_left_pads);
-        copy(conv_param.input_right_pads_, input_right_pads);
-
-        auto conv = GroupedConvBwdDataDeviceInstance{};
-
-        auto argument = conv.MakeArgument(nullptr,
-                                          nullptr,
-                                          std::array<const void*, 0>{},
-                                          nullptr,
-                                          out_lengths,
-                                          out_strides,
-                                          wei_lengths,
-                                          wei_strides,
-                                          {},
-                                          {},
-                                          in_lengths,
-                                          in_strides,
-                                          conv_filter_strides,
-                                          conv_filter_dilations,
-                                          input_left_pads,
-                                          input_right_pads,
-                                          Pass{},
-                                          Pass{},
-                                          Pass{});
-        return conv.IsSupportedArgument(argument);
-    }
-};
-
-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using NHWGC = ck::tensor_layout::convolution::NHWGC;
-
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-
-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-using NHWGK = ck::tensor_layout::convolution::NHWGK;
-
-using KernelTypes =
-    ::testing::Types<std::tuple<GNHWK, GKYXC, GNHWC>, std::tuple<NHWGK, GKYXC, NHWGC>>;
-
-template <typename Tuple>
-class TestGroupedConvndBwdDataDefault : public TestGroupedConvndBwdData<Tuple, ConvBwdDataDefault>
-{
-};
-
-template <typename Tuple>
-class TestGroupedConvndBwdDataFilter1x1
-    : public TestGroupedConvndBwdData<Tuple, Filter1x1Stride1Pad0>
-{
-};
-
-TYPED_TEST_SUITE(TestGroupedConvndBwdDataDefault, KernelTypes);
-TYPED_TEST_SUITE(TestGroupedConvndBwdDataFilter1x1, KernelTypes);
-
-TYPED_TEST(TestGroupedConvndBwdDataFilter1x1, SpecializationCheck)
-{
-    // Check filter 3,3 instead of 1,1
-    this->conv_param  = {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
-    bool is_supported = this->template Run<2>();
-    EXPECT_FALSE(is_supported);
-
-    // Check strides 2,2 instead of 1,1
-    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
-    is_supported     = this->template Run<2>();
-    EXPECT_FALSE(is_supported);
-
-    // Check with pad
-    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}};
-    is_supported     = this->template Run<2>();
-    EXPECT_FALSE(is_supported);
-
-    // Supported version
-    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
-    is_supported     = this->template Run<2>();
-    EXPECT_TRUE(is_supported);
-}
-
-TYPED_TEST(TestGroupedConvndBwdDataDefault, VectorLoadCheck)
-{
-    // vector load for A
-    this->conv_param  = {2, 2, 128, 129, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
-    bool is_supported = this->template Run<2>();
-    EXPECT_FALSE(is_supported);
-    // vector load for B, E, Ds
-    this->conv_param = {2, 2, 128, 128, 257, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
-    is_supported     = this->template Run<2>();
-    EXPECT_FALSE(is_supported);
-}