From 5a11d4edc7f0a9b95db146e00c2fa76ea2af844a Mon Sep 17 00:00:00 2001
From: Anthony Chang <ac.chang@outlook.com>
Date: Wed, 7 Sep 2022 02:38:01 +0800
Subject: [PATCH] GemmGemm TNNT instances (#399)

* add gemm_gemm TNNT instance

* sanitize Gemm1KPack

* disable instances that failed validation on mi100

[ROCm/composable_kernel commit: fe52c94c9814b0ade7b461706c246b7cf9812f19]
---
 ...wise_batched_gemm_gemm_xdl_cshuffle_v1.hpp |  3 +-
 ...ched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp |  3 +-
 .../gpu/batched_gemm_gemm.hpp                 | 20 +++++
 .../gpu/batched_gemm_gemm/CMakeLists.txt      |  1 +
 ...6_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp | 80 +++++++++++++++++++
 .../test_batched_gemm_gemm_fp16.cpp           |  3 +-
 6 files changed, 107 insertions(+), 3 deletions(-)
 create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp

diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
index 88f0c0a30b..81b85ab67e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
@@ -602,8 +602,9 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle
             static_cast<FloatAB*>(p_shared) + SharedMemTrait::b1_block_space_offset,
             b1_block_desc_bk0_n_bk1.GetElementSpaceSize());
 
+        // selected_mfma.k_per_blk <= B1K1 <= selected_mfma.group_size
         constexpr index_t Gemm1KPack = math::max(
-            math::lcm(MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size, B1K1),
+            math::gcd(MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size, B1K1),
             MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
 
         auto gemm1_blockwise_gemm = BlockwiseGemmXdlops_v2<
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
index acb2839d3c..e21705bff7 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -608,8 +608,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
             static_cast<FloatAB*>(p_shared) + SharedMemTrait::b1_block_space_offset,
             b1_block_desc_bk0_n_bk1.GetElementSpaceSize());
 
+        // selected_mfma.k_per_blk <= B1K1 <= selected_mfma.group_size
         constexpr index_t Gemm1KPack = math::max(
-            math::lcm(MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size, B1K1),
+            math::gcd(MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size, B1K1),
             MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
 
         auto gemm1_blockwise_gemm = BlockwiseGemmXdlops_v2<
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
index 8f6eaf07da..a6dcfa30d3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp
@@ -32,6 +32,20 @@ void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_i
                                                       PassThrough,
                                                       PassThrough>>>& instances);
 
+void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
+                                                      Col,
+                                                      Col,
+                                                      Row,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
 template <typename ALayout,
           typename B0Layout,
           typename B1Layout,
@@ -82,6 +96,12 @@ struct DeviceOperationInstanceFactory<
                 add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
                     op_ptrs);
             }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<B0Layout, Col> &&
+                              is_same_v<B1Layout, Col> && is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
+                    op_ptrs);
+            }
         }
         return op_ptrs;
     }
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt
index e0968a99ac..865a31e79a 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_instance_library(device_batched_gemm_gemm_instance
     device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+    device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
new file mode 100644
index 0000000000..973e4cfa93
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmPadded  = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+
+// c[g, m, o] = (a[g, m, k] * b0[g, n, k]) * b1[g, o, n]
+using device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances = std::tuple<
+    // clang-format off
+        //################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //################################|        |         |         |        |  Type|   Type|   Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //################################|        |         |         |        |      |       |       |      |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |
+        //################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |
+        // DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    4,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           2,               S<1, 32, 1, 8>,               8>,  // TODO: to enable; can trigger compiler crash in mainline #9110 but not in #10738
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    4,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           2,               S<1, 32, 1, 8>,               8>,
+        // DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    128,   256,    32,    64,    32,   8,   8,    4,   32,   32,     1,     8,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           2,               S<1, 32, 1, 8>,               8>,  // TODO: to enable; can cause validation error on MI100
+        // DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    128,   256,    32,   128,    32,   8,   8,    4,   32,   32,     1,     8,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           2,               S<1, 32, 1, 8>,               8>,  // TODO: to enable; can cause validation error on MI100
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    128,   128,    64,    64,    32,   8,   8,    4,   32,   32,     1,     4,     2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,      false,           1,           2,               S<1, 32, 1, 8>,               8>,
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    128,   128,    32,    64,    32,   8,   8,    4,   32,   32,     1,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           2,               S<1, 32, 1, 8>,               8>,
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    4,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,      false,           1,           2,               S<1, 32, 1, 8>,               8>,
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    128,   128,    32,   128,    32,   8,   8,    4,   32,   32,     1,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           2,               S<1, 32, 1, 8>,               8>,
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,     64,   256,    32,   128,    32,   8,   8,    4,   16,   16,     1,    16,     8,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           8,               S<1, 16, 1,16>,               8>,
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,     64,   256,    32,    64,    32,   8,   8,    4,   16,   16,     1,    16,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           4,               S<1, 32, 1, 8>,               8>,
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    4,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           8,               S<1, 16, 1,16>,               8>,
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    4,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           4,               S<1, 32, 1, 8>,               8>,
+        // Padded fallback kernels
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,     GemmPadded,        1,   256,    128,   128,    64,   128,    32,   8,   8,    4,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,      false,           1,           2,               S<1, 32, 1, 8>,               8>,
+        DeviceBatchedGemmGemm_Xdl_CShuffle<     Row,      Col,      Col,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough,     GemmPadded,        1,   256,    128,    64,    32,   128,    32,   8,   8,    4,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,       true,           1,           2,               S<1, 32, 1, 8>,               8>
+    // clang-format on
+    >;
+
+void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmGemm<Row,
+                                                      Col,
+                                                      Col,
+                                                      Row,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      F16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
index f9c74dfbb3..f3e12a9123 100644
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp
@@ -11,7 +11,8 @@ class TestBatchedGemmGemmFP16 : public TestBatchedGemmGemm<Tuple>
 
 // clang-format off
 using KernelTypes = ::testing::Types<
-    std::tuple<F16, F16, F16, F16, Row, Col, Row, Row>
+    std::tuple<F16, F16, F16, F16, Row, Col, Row, Row>,
+    std::tuple<F16, F16, F16, F16, Row, Col, Col, Row>
     >;
 // clang-format on