Add instances for pipeline v1 and v3

2026-06-08 15:30:23 +00:00 · 2025-08-12 10:18:26 +00:00
parent 70238cab87
commit 45b3d26e3c
1 changed files with 47 additions and 15 deletions
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_exp_gemm_wmma_universal_km_kn_mn_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_exp_gemm_wmma_universal_km_kn_mn_instance.hpp
@@ -40,11 +40,20 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 template <typename InOutDataType, GemmSpecialization GemmSpec>
 using device_gemm_wmma_universal_km_kn_mn_comp_instances = std::tuple<
    // clang-format off
-        //#####################################| ALayout| BLayout| DsLayout |ELayout|     ADataType|     BDataType| DsDataType|     CDataType|   AccDataType|      CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CShuffleBlockTransferClusterLengths|  CShuffleBlockTransfer|                          BlockwiseGemm|                BlockwiseGemm|
-        //#####################################|        |        |          |       |              |              |           |              |              |      DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|     MRepeat|     NRepeat|  _MBlock_MPerBlock_NBlock_NPerBlock|       ScalarPerVectors|                               Pipeline|                     Pipeline|
-        //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |                              Scheduler|                     Verision|
-        //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                                       |                             |
-        DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,   16,   16,       8,       4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,                     S<1, 16, 1, 16>,               S<4,4,4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>
+    //#####################################| ALayout| BLayout| DsLayout |ELayout|     ADataType|     BDataType| DsDataType|     CDataType|   AccDataType|      CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CShuffleBlockTransferClusterLengths|  CShuffleBlockTransfer|                          BlockwiseGemm|                BlockwiseGemm|
+    //#####################################|        |        |          |       |              |              |           |              |              |      DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|     MRepeat|     NRepeat|  _MBlock_MPerBlock_NBlock_NPerBlock|       ScalarPerVectors|                               Pipeline|                     Pipeline|
+    //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |                              Scheduler|                      Version|
+    //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                                       |                             |
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,   16,   16,       8,       4,    S< 8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,         0,    S< 8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,                     S<1, 16, 1, 16>,               S<4,4,4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,   16,   16,       4,       2,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,                     S<1, 16, 1, 16>,               S<4,4,4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,   16,   16,       4,       2,    S< 8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,                     S<1, 16, 1, 16>,               S<4,4,4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,   16,   16,       8,       4,    S< 8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,         0,    S< 8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,                     S<1, 16, 1, 16>,               S<4,4,4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,   16,   16,       8,       4,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,                     S<1, 16, 1, 16>,               S<4,4,4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,   16,   16,       8,       4,    S< 8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,         0,    S< 8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,                     S<1, 16, 1, 16>,               S<4,4,4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,   16,   16,       8,       4,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,                     S<1, 16, 1, 16>,               S<4,4,4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,   16,   16,       4,       2,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,                     S<1, 16, 1, 16>,               S<4,4,4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,   16,   16,       4,       2,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,                     S<1, 16, 1, 16>,               S<4,4,4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>
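+    // NOTE(review): entries 1 and 4 of this tuple are identical, as are entries 2 and 9 (both v1); confirm whether entry 9 was meant to be a distinct configuration.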
    // clang-format on
    >;

@@ -53,13 +62,18 @@ template <typename InOutDataType,
          GemmSpecialization GemmSpec>
 using device_gemm_wmma_universal_km_kn_mn_mem_instances = std::tuple<
    // clang-format off
-        //#####################################| ALayout| BLayout| DsLayout |ELayout|     ADataType|     BDataType| DsDataType|     CDataType|   AccDataType|      CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CShuffleBlockTransferClusterLengths|  CShuffleBlockTransfer|     BlockwiseGemm|                BlockwiseGemm|
-        //#####################################|        |        |          |       |              |              |           |              |              |      DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|     MRepeat|     NRepeat|  _MBlock_MPerBlock_NBlock_NPerBlock|       ScalarPerVectors|          Pipeline|                     Pipeline|
-        //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |         Scheduler|                     Verision|
-        //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                  |                             |
-        // Latency friendly
-        DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   4,   4,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<2,2,2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
-        // Memory friendly
+    //#####################################| ALayout| BLayout| DsLayout |ELayout|     ADataType|     BDataType| DsDataType|     CDataType|   AccDataType|      CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CShuffleBlockTransferClusterLengths|  CShuffleBlockTransfer|     BlockwiseGemm|                BlockwiseGemm|
+    //#####################################|        |        |          |       |              |              |           |              |              |      DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|     MRepeat|     NRepeat|  _MBlock_MPerBlock_NBlock_NPerBlock|       ScalarPerVectors|          Pipeline|                     Pipeline|
+    //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |         Scheduler|                      Version|
+    //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                  |                             |
+    // Latency friendly
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   4,   4,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<2,2,2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   2,   2,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,         0,           1,           1,                     S<1,  8, 1,  8>,               S<2,2,2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    32,    16,    16,    64,   4,   4,   16,   16,       1,       1,    S< 8,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,    S< 8,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,                     S<1,  8, 1,  4>,               S<4,4,4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,    32,    64,   4,   4,   16,   16,       1,       1,    S< 8,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,    S< 8,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<4,4,4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,    32,    64,   2,   2,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,         0,           1,           1,                     S<1,  8, 1,  8>,               S<4,4,4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
+    // Memory friendly
+    // TODO: add once v2 is implemented
    // clang-format on
    >;

@@ -73,7 +87,13 @@ using device_gemm_wmma_universal_km_kn_mn_irregular_odd_m_instances = std::tuple
    //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |         Scheduler|                     Verision|
    //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                  |                             |
    // Latency friendly
-    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   4,   4,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<2,2,2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   4,   4,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<2,2,2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   2,   2,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              2,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,         0,           1,           1,                     S<1,  8, 1,  8>,               S<2,2,2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    32,    16,    16,    64,   4,   4,   16,   16,       1,       1,    S< 8,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         0,    S< 8,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,                     S<1,  8, 1,  4>,               S<4,4,4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,    32,    64,   4,   4,   16,   16,       1,       1,    S< 8,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         0,    S< 8,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<4,4,4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,    32,    64,   2,   2,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              2,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,         0,           1,           1,                     S<1,  8, 1,  8>,               S<4,4,4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
+    // Memory friendly
+    // TODO: add once v2 is implemented
    // clang-format on
    >;

@@ -87,7 +107,13 @@ using device_gemm_wmma_universal_km_kn_mn_odd_n_instances = std::tuple<
    //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |         Scheduler|                     Verision|
    //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                  |                             |
    // Latency friendly
-    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   4,   4,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   4,   4,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   2,   2,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              2,         0,           1,           1,                     S<1,  8, 1,  8>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    32,    16,    16,    64,   4,   4,   16,   16,       1,       1,    S< 8,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,    S< 8,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         0,           1,           1,                     S<1,  8, 1,  4>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,    32,    64,   4,   4,   16,   16,       1,       1,    S< 8,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,    S< 8,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,    32,    64,   2,   2,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              2,         0,           1,           1,                     S<1,  8, 1,  8>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
+    // Memory friendly
+    // TODO: add once v2 is implemented
    // clang-format on
    >;

@@ -101,7 +127,13 @@ using device_gemm_wmma_universal_km_kn_mn_irregular_odd_mn_instances = std::tupl
    //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |         Scheduler|                      Version|
    //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                  |                             |
    // Latency friendly
-    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   4,   4,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   4,   4,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    32,    16,    64,   2,   2,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              2,         0,           1,           1,                     S<1,  8, 1,  8>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    32,    16,    16,    64,   4,   4,   16,   16,       1,       1,    S< 8,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,    S< 8,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         0,           1,           1,                     S<1,  8, 1,  4>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,    32,    64,   4,   4,   16,   16,       1,       1,    S< 8,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,    S< 8,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         0,           1,           1,                     S<1,  8, 1,  8>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,    32,    64,   2,   2,   16,   16,       1,       1,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,         0,    S<16,  4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              2,         0,           1,           1,                     S<1,  8, 1,  8>,               S<1,1,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
+    // Memory friendly
+    // TODO: add once v2 is implemented
    // clang-format on
    >;