From 044bcfcb1e47993c44c10d172bf75b5254273eea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <ville.pietila@amd.com>
Date: Thu, 16 Oct 2025 11:03:14 +0000
Subject: [PATCH] Take universal GEMM pipeline into use for grouped
 convolutions.

---
 ...ped_convolution_backward_weight_kernel.hpp |   2 +-
 .../gpu/gemm_configs.hpp                      |  91 ++++++
 ...grouped_conv_bwd_weight_bf16_instances.hpp | 162 +----------
 ...grouped_conv_bwd_weight_fp16_instances.hpp |  23 +-
 .../tile_grouped_conv_bwd_weight_invoker.hpp  | 271 ++++++++++--------
 .../tile_grouped_conv_instance_factory.hpp    |   5 +
 6 files changed, 278 insertions(+), 276 deletions(-)
 create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
index 7906aa3389..874a23d670 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -466,7 +466,7 @@ struct GroupedConvolutionBackwardWeightKernel
             {
                 if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
                 {
-                    CK_TILE_ERROR("Conditions not met for Kbatch >1 !");
+                    CK_TILE_ERROR("Conditions not met for Kbatch > 1!");
                 }
                 return false;
             }
diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp
new file mode 100644
index 0000000000..509486569b
--- /dev/null
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include <variant>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+
+#define CK_TILE_PIPELINE_COMPUTE_V3 1
+#define CK_TILE_PIPELINE_MEMORY 2
+#define CK_TILE_PIPELINE_COMPUTE_V4 3
+#define CK_TILE_PIPELINE_COMPUTE_V5 4
+
+namespace ck_tile {
+namespace ops {
+
+using MemoryOpSet = std::integral_constant<ck_tile::memory_operation_enum, 
+                                                ck_tile::memory_operation_enum::set>;
+
+using MemoryOpAtomicAdd = std::integral_constant<ck_tile::memory_operation_enum,
+                                        ck_tile::memory_operation_enum::atomic_add>;
+
+struct GemmConfigBase
+{
+    static constexpr bool kPadM = true;
+    static constexpr bool kPadN = true;
+    static constexpr bool kPadK = true;
+
+    static constexpr bool PermuteA = false;
+    static constexpr bool PermuteB = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+
+    static constexpr int kBlockPerCu                         = 1;
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t Pipeline      = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool Preshuffle                = false;
+    static constexpr bool TiledMMAPermuteN          = false;
+};
+
+template <ck_tile::index_t PipelineId>
+struct PipelineTypeTraits;
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V5>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV5<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV5<PipelineProblem>;
+};
+
+} // namespace ops
+} // namespace ck_tile
diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp
index bd081f569a..7428393ab6 100644
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp
@@ -17,155 +17,21 @@ template <ck_tile::index_t NDimSpatial,
           typename BLayout,
           typename ELayout>
 using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple<
-    // clang-format off
-        //#####################################|       Num|  InLayout| WeiLayout| OutLayout| InData| WeiData| OutData|           In|         Wei|         Out|  K-block|  M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector|
-        //#####################################|       Dim|          |          |          |   Type|    Type|    Type|  Elementwise| Elementwise| Elementwise|      per|        |        |        |       |       |       |   tile|   tile|   tile|   size|   size|   size|
-        //#####################################|   Spatial|          |          |          |       |        |        |    Operation|   Operation|   Operation|       CU|        |        |        |       |       |       |   size|   size|   size|      A|      B|      C|
-        //#####################################|          |          |          |          |       |        |        |             |            |            |                    
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     2,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     2,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     2,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     2,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     4,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     4,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     4,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     4,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     8,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     8,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     8,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     8,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     1,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     1,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     1,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     1,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     2,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     2,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     2,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     2,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     8,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     8,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     8,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     8,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     1,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     1,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     1,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     1,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     2,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     2,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     2,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     2,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     4,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     4,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     4,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     4,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     8,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     8,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     8,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     8,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     1,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     1,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     1,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     1,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     2,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     2,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     2,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     2,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     4,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     4,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     4,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     4,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     8,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     8,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     8,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     8,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     2,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     2,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     2,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     2,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     4,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     4,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     4,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     4,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     8,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     8,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     8,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     8,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     1,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     1,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     1,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     1,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     2,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     2,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     2,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     2,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     8,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     8,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     8,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     8,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     1,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     1,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     1,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     1,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     2,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     2,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     2,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     2,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     4,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     4,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     4,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     4,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     8,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     8,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     8,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      4,     8,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     1,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     1,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     1,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     1,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     2,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     2,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     2,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     2,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     4,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     4,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     4,       4>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     4,       8>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     8,       1>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     8,       2>,
-        // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      8,     8,       4>,
+// clang-format off
+    //#####################################|       Num|  InLayout| WeiLayout| OutLayout| InData| WeiData| OutData|           In|         Wei|         Out|  K-block|  M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector|  Double|                         GEMM|
+    //#####################################|       Dim|          |          |          |   Type|    Type|    Type|  Elementwise| Elementwise| Elementwise|      per|        |        |        |       |       |       |   tile|   tile|   tile|   size|   size|   size|    smem|                     pipeline|
+    //#####################################|   Spatial|          |          |          |       |        |        |    Operation|   Operation|   Operation|       CU|        |        |        |       |       |       |   size|   size|   size|      A|      B|      C|  buffer|                      version|
+    GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,      8,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>
+    // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,       8,     128,      64,      2,      2,      1,      4,     64,     16,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,     128,       8,      64,      2,      2,      1,     64,      4,     16,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
 
-        //#####################################|       Num|  InLayout| WeiLayout| OutLayout| InData| WeiData| OutData|           In|         Wei|         Out|  K-block|  M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector|
-        //#####################################|       Dim|          |          |          |   Type|    Type|    Type|  Elementwise| Elementwise| Elementwise|      per|        |        |        |       |       |       |   tile|   tile|   tile|   size|   size|   size|
-        //#####################################|   Spatial|          |          |          |       |        |        |    Operation|   Operation|   Operation|       CU|        |        |        |       |       |       |   size|   size|   size|      A|      B|      C|
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       2>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,      8,      2,     4,       2>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      2,     4,       2>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,       8,     128,      64,      2,      2,      1,      4,     64,     16,      2,     4,       2>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,     128,       8,      64,      2,      2,      1,     64,      4,     16,      2,     4,       2>,
-
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      1,      1,      1,     32,     32,      8,      2,     2,       2>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      1,      1,      1,     32,     32,     16,      2,     2,       2>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      1,      1,      1,     16,     16,     32,      2,     2,       2>
-    // // clang-format on
-    // // clang-format on
-    // clang-format on
-    >;
+    // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      1,      1,      1,     32,     32,      8,      2,     2,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      1,      1,      1,     32,     32,     16,      2,     2,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      1,      1,      1,     16,     16,     32,      2,     2,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>
+// // clang-format on
+>;
 
 } // namespace ops
 } // namespace ck_tile
diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_fp16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_fp16_instances.hpp
index 49c73eda95..e88b65e3e5 100644
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_fp16_instances.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_fp16_instances.hpp
@@ -17,21 +17,14 @@ template <ck_tile::index_t NDimSpatial,
           typename BLayout,
           typename ELayout>
 using tile_grouped_conv_bwd_weight_f16_instances = std::tuple<
-    // clang-format off
-        //#####################################|       Num|  InLayout| WeiLayout| OutLayout| InData| WeiData| OutData|           In|         Wei|         Out|  K-block|  M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector|
-        //#####################################|       Dim|          |          |          |   Type|    Type|    Type|  Elementwise| Elementwise| Elementwise|      per|        |        |        |       |       |       |   tile|   tile|   tile|   size|   size|   size|
-        //#####################################|   Spatial|          |          |          |       |        |        |    Operation|   Operation|   Operation|       CU|        |        |        |       |       |       |   size|   size|   size|      A|      B|      C|
-        //#####################################|          |          |          |          |       |        |        |             |            |            |                    
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,    F16,     F16,     F16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       8>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,    F16,     F16,     F16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       2>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,    F16,     F16,     F16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       4>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,    F16,     F16,     F16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       8>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,    F16,     F16,     F16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       8>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,    F16,     F16,     F16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       2>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,    F16,     F16,     F16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       4>,
-        GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,    F16,     F16,     F16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      1,     1,       8>
-    // clang-format on
-    >;
+// clang-format off
+    //#####################################|       Num|  InLayout| WeiLayout| OutLayout| InData| WeiData| OutData|           In|         Wei|         Out|  K-block|  M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector|    Double|                         GEMM|
+    //#####################################|       Dim|          |          |          |   Type|    Type|    Type|  Elementwise| Elementwise| Elementwise|      per|        |        |        |       |       |       |   tile|   tile|   tile|   size|   size|   size|      smem|                     pipeline|
+    //#####################################|   Spatial|          |          |          |       |        |        |    Operation|   Operation|   Operation|       CU|        |        |        |       |       |       |   size|   size|   size|      A|      B|      C|    buffer|                      version|            |                    
+    GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,    F16,     F16,     F16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,    F16,     F16,     F16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,      8,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>
+// clang-format on
+>;
 
 } // namespace ops
 } // namespace ck_tile
diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp
index 918ad2271b..ca32e9f0c7 100644
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp
@@ -12,6 +12,7 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "ck_tile/ops/grouped_convolution.hpp" 
+#include "ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp"
 
 namespace ck_tile {
 namespace ops {
@@ -31,7 +32,7 @@ template <ck_tile::index_t NDimSpatial,
 struct GroupedConvolutionBackwardWeightBaseInvoker
 {
     virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0; 
-    virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) = 0;
+    virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) const = 0;
     virtual std::string GetName(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0;
     GroupedConvolutionBackwardWeightBaseInvoker() = default;
     GroupedConvolutionBackwardWeightBaseInvoker(const GroupedConvolutionBackwardWeightBaseInvoker&) = default;
@@ -64,7 +65,9 @@ template <
     ck_tile::index_t K_Warp_Tile,
     ck_tile::index_t VectorSizeA,
     ck_tile::index_t VectorSizeB,
-    ck_tile::index_t VectorSizeC>
+    ck_tile::index_t VectorSizeC,
+    bool DoubleSmemBuffer,
+    ck_tile::index_t PipelineVersion>
 struct GroupedConvolutionBackwardWeightInvoker : 
     public GroupedConvolutionBackwardWeightBaseInvoker<NDimSpatial,
                                                         InLayout,
@@ -77,144 +80,188 @@ struct GroupedConvolutionBackwardWeightInvoker :
                                                         WeiElementwiseOperation,
                                                         OutElementwiseOperation>
 {
-    using CodegenShape_ =
-            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+    using GemmShape = ck_tile::TileGemmShape<
+            ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+            ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+            ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>,
+            GemmConfigBase::PermuteA,
+            GemmConfigBase::PermuteB>;
 
-    static constexpr auto ConvSpec_  = ck_tile::ConvolutionSpecialization::Default;
+    static constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
 
-    using TilePartitioner_           = ck_tile::GemmTile1DPartitioner<CodegenShape_>;
-    using GroupedConvTraitsType_     = ck_tile::GroupedConvTraits<NDimSpatial,
-                                                                ConvSpec_,
-                                                                InLayout,
-                                                                WeiLayout,
-                                                                ck_tile::tuple<>, // = DsLayout
-                                                                OutLayout,
-                                                                VectorSizeA,
-                                                                VectorSizeB,
-                                                                VectorSizeC>;
+    using TilePartitioner =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                       GemmConfigBase::TileParitionerGroupNum,
+                                                       GemmConfigBase::TileParitionerM01>;
+
+    using GroupedConvTraitsType = ck_tile::GroupedConvTraits<NDimSpatial,
+                                                                 ConvSpec,
+                                                                 InLayout,
+                                                                 WeiLayout,
+                                                                 ck_tile::tuple<>, // = DsLayout
+                                                                 OutLayout,
+                                                                 VectorSizeA,
+                                                                 VectorSizeB,
+                                                                 VectorSizeC>;
+
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<
+            GemmConfigBase::kPadM,
+            GemmConfigBase::kPadN,
+            GemmConfigBase::kPadK,
+            DoubleSmemBuffer,
+            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight::AsLayout,
+            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight::BsLayout,
+            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight::CLayout,
+            GemmConfigBase::TransposeC,
+            GemmConfigBase::UseStructuredSparsity,
+            false, // Persistent,
+            GemmConfigBase::NumWaveGroups>;
 
     using AccDataType = float;
-    using CDEElementWise = ck_tile::element_wise::PassThrough;
+    using GemmPipelineProblem = ck_tile::GemmPipelineProblem<
+            OutDataType,
+            InDataType,
+            AccDataType,
+            GemmShape,
+            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight,
+            ck_tile::element_wise::PassThrough,
+            ck_tile::element_wise::PassThrough,
+            WeiDataType,
+            true,
+            VectorSizeA,
+            VectorSizeB>;
 
-    using CodegenPipelineProblem_ = ck_tile::GemmPipelineProblem<
-        InDataType,
-        WeiDataType,
-        AccDataType,
-        CodegenShape_,
-        typename GroupedConvTraitsType_::GroupedConvImplicitGemmTraitsBwdWeight,
-        ck_tile::element_wise::PassThrough,
-        ck_tile::element_wise::PassThrough,
-        InDataType,
-        true,
-        GroupedConvTraitsType_::VectorSizeA,
-        GroupedConvTraitsType_::VectorSizeB>;
+    using BaseGemmPipeline = typename PipelineTypeTraits<PipelineVersion>::template UniversalGemmPipeline<GemmPipelineProblem>;
+    
+    template <bool HasHotLoop, ck_tile::TailNumber TailNumber, ck_tile::memory_operation_enum MemOp>
+    auto CreateKernel() const
+    {
+        constexpr auto scheduler = GemmConfigBase::Scheduler;
+    
+        using UniversalGemmProblem =
+            ck_tile::UniversalGemmPipelineProblem<OutDataType,
+                                                InDataType,
+                                                AccDataType,
+                                                GemmShape,
+                                                GemmUniversalTraits,
+                                                scheduler,
+                                                HasHotLoop,
+                                                TailNumber,
+                                                ck_tile::element_wise::PassThrough,
+                                                ck_tile::element_wise::PassThrough,
+                                                WeiDataType,
+                                                true,
+                                                VectorSizeA,
+                                                VectorSizeB>;
 
-    using CodegenPipeline_ = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem_>;
+        using GemmPipeline = typename PipelineTypeTraits<PipelineVersion>::template GemmPipeline<UniversalGemmProblem>;
 
-    using ConvEpilogueAtomicAdd_ = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
-                InDataType,
-                WeiDataType,
-                ck_tile::tuple<>, // = DsDataType,
-                AccDataType,
-                OutDataType,
-                typename GroupedConvTraitsType_::ImplicitGemmDsLayout,
-                ck_tile::tensor_layout::gemm::RowMajor,
-                CDEElementWise,
-                TilePartitioner_::MPerBlock,
-                TilePartitioner_::NPerBlock,
-                M_Warp,
-                N_Warp,
-                M_Warp_Tile,
-                N_Warp_Tile,
-                K_Warp_Tile,
-                CodegenPipelineProblem_::TransposeC,
-                ck_tile::memory_operation_enum::atomic_add,
-                1,
-                true,
-                GroupedConvTraitsType_::VectorSizeC>>;
+        using CDEElementWise = ck_tile::element_wise::PassThrough;
 
-    using ConvEpilogueSet_ = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
-                InDataType,
-                WeiDataType,
-                ck_tile::tuple<>, // = DsDataType,
-                AccDataType,
-                OutDataType,
-                typename GroupedConvTraitsType_::ImplicitGemmDsLayout,
-                ck_tile::tensor_layout::gemm::RowMajor,
-                CDEElementWise,
-                TilePartitioner_::MPerBlock,
-                TilePartitioner_::NPerBlock,
-                M_Warp,
-                N_Warp,
-                M_Warp_Tile,
-                N_Warp_Tile,
-                K_Warp_Tile,
-                CodegenPipelineProblem_::TransposeC,
-                ck_tile::memory_operation_enum::set,
-                1,
-                true,
-                GroupedConvTraitsType_::VectorSizeC>>;
+        using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+            OutDataType,
+            InDataType,
+            ck_tile::tuple<>, // = DsDataType
+            AccDataType,
+            WeiDataType,
+            typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+            ck_tile::tensor_layout::gemm::RowMajor,
+            CDEElementWise,
+            TilePartitioner::MPerBlock,
+            TilePartitioner::NPerBlock,
+            M_Warp,
+            N_Warp,
+            M_Warp_Tile,
+            N_Warp_Tile,
+            K_Warp_Tile,
+            GemmConfigBase::TransposeC,
+            MemOp,
+            1,
+            true,
+            GroupedConvTraitsType::VectorSizeC>>;
 
-    using KernelSplitK = ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedConvTraitsType_,
-                                                                           TilePartitioner_,
-                                                                           CodegenPipeline_,
-                                                                           ConvEpilogueAtomicAdd_>;
-
-    using KernelNonSplitK = ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedConvTraitsType_,
-                                                                           TilePartitioner_,
-                                                                           CodegenPipeline_,
-                                                                           ConvEpilogueSet_>;
+        return ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedConvTraitsType,
+                                                            TilePartitioner,
+                                                            GemmPipeline,
+                                                            ConvEpilogue>{};   
+    }
 
     bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const override
     {
-        if (args.k_batch == 1)
+        if (args.k_batch > 1)
         {
-            return KernelNonSplitK::IsSupportedArgument(KernelNonSplitK::MakeKernelArgs(args));
+            using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::atomic_add>());
+            return Kernel::IsSupportedArgument(args);    
         }
-        return KernelSplitK::IsSupportedArgument(KernelSplitK::MakeKernelArgs(args));
+        using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::set>());
+        return Kernel::IsSupportedArgument(args);
     };
 
-    template <typename Kernel>
-    float RunImpl(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel)
+    float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) const override
     {
-        auto kargs        = Kernel::MakeKernelArgs(args);
-        const dim3 grids  = Kernel::GridSize(kargs);
-        const dim3 blocks = Kernel::BlockSize();
+        const ck_tile::index_t gemm_k =
+            args.N_ * std::accumulate(args.output_spatial_lengths_.begin(),
+                                      args.output_spatial_lengths_.end(),
+                                      1,
+                                      std::multiplies<ck_tile::index_t>());
 
-        constexpr int n_warmup = 5;
-        constexpr int n_repeat = 50;
-        ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};
-        float avg_time = ck_tile::launch_kernel_time_mask(
-            s,
-            Kernel::Preprocess(kargs, s),
-            ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
+        const ck_tile::index_t K_split     = (gemm_k + k_grain - 1) / k_grain * K_Tile;
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+        float ave_time{0};
 
-        return avg_time;
-    };
+        const auto Run = [&](const auto has_hot_loop_,
+                             const auto tail_number_,
+                             const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto memory_operation = memory_operation_.value;
 
-    float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) override
-    {
-        if (args.k_batch == 1)
-        {
-            return RunImpl<KernelNonSplitK>(args, time_kernel);
-        }
-        else
-        {
-            return RunImpl<KernelSplitK>(args, time_kernel);
-        }
+            auto kernel = CreateKernel<has_hot_loop_v, tail_number_v, memory_operation>();
+            using Kernel = decltype(kernel);
+                             
+            auto kargs   = Kernel::MakeKernelArgs(args);
+            const dim3 grids  = Kernel::GridSize(args);
+            const dim3 blocks = Kernel::BlockSize();
+                   
+            constexpr int n_warmup = 5;
+            constexpr int n_repeat = 50;
+            ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};
+
+            ave_time = ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<kBlockPerCu>(kernel, grids, blocks, 0, kargs));
+
+            return ave_time;
+        };
+
+        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+            if(args.k_batch == 1)
+            {
+                Run(has_hot_loop_, tail_number_, MemoryOpSet{});
+            }
+            else
+            {
+                Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
+            }
+        };
+
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+        return ave_time;
     };
 
     std::string GetName(const ck_tile::GroupedConvBwdWeightHostArgs& args) const override
     {
         std::stringstream min_occupancy;
         min_occupancy << "_blk_per_cu_" << kBlockPerCu;
-        if (args.k_batch == 1)
+        if (args.k_batch > 1)
         {
-            return KernelNonSplitK::GetName() + min_occupancy.str();
+            using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::atomic_add>());
+            return Kernel::GetName() + min_occupancy.str();    
         }
-        return KernelSplitK::GetName() + min_occupancy.str();
+        using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::set>());
+        return Kernel::GetName() + min_occupancy.str();
     };
 
     GroupedConvolutionBackwardWeightInvoker() = default;
diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp
index 6ec251b052..e4bbc9d8cc 100644
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp
@@ -10,6 +10,11 @@
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
 
+#define CK_TILE_PIPELINE_COMPUTE_V3 1
+#define CK_TILE_PIPELINE_MEMORY 2
+#define CK_TILE_PIPELINE_COMPUTE_V4 3
+#define CK_TILE_PIPELINE_COMPUTE_V5 4
+
 namespace ck_tile {
 namespace ops {