[CK Tile] Implement Invoker pattern for remaining grouped convolution examples (#2894)

* Invoker for grouped_conv_fwd
* Invoker for grouped_conv_bwd_data
* Fix incorrect out layout identifier (the 3D forward dispatch compared out_layout against "GKZYXC")
2026-04-19 22:39:03 +00:00 · 2025-09-24 10:22:38 +02:00
parent 6805684788
commit 15fff74503
6 changed files with 429 additions and 389 deletions
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp
@@ -11,199 +11,14 @@

 #include "ck_tile/host.hpp"
 #include "grouped_convolution_utils.hpp"
-
-template <ck_tile::index_t NDimSpatial,
-          typename GemmWarpConfig,
-          typename InDataType,
-          typename WeiDataType,
-          typename AccDataType,
-          typename OutDataType,
-          typename InLayout,
-          typename WeiLayout,
-          typename OutLayout,
-          typename DsDataType     = ck_tile::tuple<>,
-          typename DsLayout       = ck_tile::tuple<>,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args,
-                            const ck_tile::stream_config& s)
-{
-    constexpr int kBlockPerCu = 1;
-
-    constexpr ck_tile::index_t M_Tile = 64;
-    constexpr ck_tile::index_t N_Tile = 64;
-    constexpr ck_tile::index_t K_Tile = 32;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile;
-    constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile;
-    constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile;
-
-    constexpr ck_tile::index_t VectorSizeA = 1;
-    constexpr ck_tile::index_t VectorSizeB = 1;
-    constexpr ck_tile::index_t VectorSizeC = 8;
-
-    // Implicit GEMM Traits
-    using CodegenShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    constexpr auto ConvSpec      = ck_tile::ConvolutionSpecialization::Default;
-    using TilePartitioner        = ck_tile::GemmTile1DPartitioner<CodegenShape>;
-    using GroupedConvTraitsType  = ck_tile::GroupedConvTraits<NDimSpatial,
-                                                              ConvSpec,
-                                                              InLayout,
-                                                              WeiLayout,
-                                                              DsLayout,
-                                                              OutLayout,
-                                                              VectorSizeA,
-                                                              VectorSizeB,
-                                                              VectorSizeC>;
-    using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<
-        InDataType,
-        WeiDataType,
-        AccDataType,
-        CodegenShape,
-        typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData,
-        ck_tile::element_wise::PassThrough,
-        ck_tile::element_wise::PassThrough,
-        InDataType,
-        true,
-        GroupedConvTraitsType::VectorSizeA,
-        GroupedConvTraitsType::VectorSizeB>;
-    using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
-
-    const auto Run = [&](const auto memory_operation_) {
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using ConvEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<InDataType,
-                                             WeiDataType,
-                                             DsDataType,
-                                             AccDataType,
-                                             OutDataType,
-                                             typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                                             ck_tile::tensor_layout::gemm::RowMajor,
-                                             CDEElementWise,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             CodegenPipelineProblem::TransposeC,
-                                             memory_operation,
-                                             1,
-                                             true,
-                                             GroupedConvTraitsType::VectorSizeC>>;
-
-        using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel<GroupedConvTraitsType,
-                                                                     TilePartitioner,
-                                                                     CodegenPipeline,
-                                                                     ConvEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(args);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << '\n'
-                      << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
-                      << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
-                      << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-
-    if(args.k_batch == 1)
-    {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::set>{});
-    }
-    else
-    {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::atomic_add>{});
-    }
-}
-
+#include "grouped_convolution_backward_data_invoker.hpp"
 #include "run_grouped_convolution_bwd_data_example.inc"

-template <typename GemmWarpConfig,
-          typename InPrecType,
-          typename WeiPrecType = InPrecType,
-          typename OutPrecType = InPrecType>
-int run_grouped_conv_bwd_data_example_prec_type(
-    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
-{
-    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
-    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
-    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
-
-    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
-    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
-    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
-
-    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
-    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
-    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
-
-    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
-    {
-        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<1>{},
-                                                              GemmWarpConfig,
-                                                              InPrecType,
-                                                              WeiPrecType,
-                                                              OutPrecType>(
-            argc, argv, NWGC{}, GKXC{}, NWGK{});
-    }
-    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
-    {
-        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<2>{},
-                                                              GemmWarpConfig,
-                                                              InPrecType,
-                                                              WeiPrecType,
-                                                              OutPrecType>(
-            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
-    }
-    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
-    {
-        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<3>{},
-                                                              GemmWarpConfig,
-                                                              InPrecType,
-                                                              WeiPrecType,
-                                                              OutPrecType>(
-            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported memory layout!");
-    }
-}
-
 template <typename GemmWarpConfig>
 int run_grouped_conv_bwd_data_example(int argc, char* argv[])
 {
+    using Invoker = GroupedConvolutionBackwardDataInvoker;
+
    auto [result, arg_parser] = create_args(argc, argv);
    if(!result)
        return -1;
@@ -215,12 +30,16 @@ int run_grouped_conv_bwd_data_example(int argc, char* argv[])

    if(data_type == "fp16")
    {
-        return run_grouped_conv_bwd_data_example_prec_type<GemmWarpConfig, ck_tile::half_t>(
+        return run_grouped_conv_bwd_data_example_prec_type<Invoker,
+                                                           GemmWarpConfig,
+                                                           ck_tile::half_t>(
            in_layout, wei_layout, out_layout, argc, argv);
    }
    else if(data_type == "bf16")
    {
-        return run_grouped_conv_bwd_data_example_prec_type<GemmWarpConfig, ck_tile::bf16_t>(
+        return run_grouped_conv_bwd_data_example_prec_type<Invoker,
+                                                           GemmWarpConfig,
+                                                           ck_tile::bf16_t>(
            in_layout, wei_layout, out_layout, argc, argv);
    }
    else
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "grouped_convolution_utils.hpp"
+
+struct GroupedConvolutionBackwardDataInvoker
+{
+    // Composes the backward-data kernel from the template arguments, launches it, and returns launch_kernel's float result.
+    template <ck_tile::index_t NDimSpatial,
+              typename GemmWarpConfig,
+              typename InDataType,
+              typename WeiDataType,
+              typename AccDataType,
+              typename OutDataType,
+              typename InLayout,
+              typename WeiLayout,
+              typename OutLayout,
+              typename DsDataType     = ck_tile::tuple<>,
+              typename DsLayout       = ck_tile::tuple<>,
+              typename CDEElementWise = ck_tile::element_wise::PassThrough>
+    static float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args,
+                                       const ck_tile::stream_config& s)
+    {
+        constexpr int kBlockPerCu = 1;
+        // Hard-coded M/N/K tile sizes; they form the first sequence passed to TileGemmShape below.
+        constexpr ck_tile::index_t M_Tile = 64;
+        constexpr ck_tile::index_t N_Tile = 64;
+        constexpr ck_tile::index_t K_Tile = 32;
+        // M/N/K warp values; second TileGemmShape sequence, and M_Warp/N_Warp are also given to the epilogue.
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+        // Warp-tile sizes are taken from the caller-supplied GemmWarpConfig; third TileGemmShape sequence.
+        constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile;
+        constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile;
+        constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile;
+        // Vector sizes handed to GroupedConvTraits (A = 1, B = 1, C = 8 in this invoker).
+        constexpr ck_tile::index_t VectorSizeA = 1;
+        constexpr ck_tile::index_t VectorSizeB = 1;
+        constexpr ck_tile::index_t VectorSizeC = 8;
+
+        // Implicit GEMM Traits
+        using CodegenShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        constexpr auto ConvSpec      = ck_tile::ConvolutionSpecialization::Default;
+        using TilePartitioner        = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+        using GroupedConvTraitsType  = ck_tile::GroupedConvTraits<NDimSpatial,
+                                                                  ConvSpec,
+                                                                  InLayout,
+                                                                  WeiLayout,
+                                                                  DsLayout,
+                                                                  OutLayout,
+                                                                  VectorSizeA,
+                                                                  VectorSizeB,
+                                                                  VectorSizeC>;
+        using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<
+            InDataType,
+            WeiDataType,
+            AccDataType,
+            CodegenShape,
+            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData,
+            ck_tile::element_wise::PassThrough,
+            ck_tile::element_wise::PassThrough,
+            InDataType,
+            true,
+            GroupedConvTraitsType::VectorSizeA,
+            GroupedConvTraitsType::VectorSizeB>;
+        using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+        // Run takes the memory operation as an integral_constant so it can be used as a template argument of the epilogue.
+        const auto Run = [&](const auto memory_operation_) {
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+                InDataType,
+                WeiDataType,
+                DsDataType,
+                AccDataType,
+                OutDataType,
+                typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                ck_tile::tensor_layout::gemm::RowMajor,
+                CDEElementWise,
+                TilePartitioner::MPerBlock,
+                TilePartitioner::NPerBlock,
+                M_Warp,
+                N_Warp,
+                M_Warp_Tile,
+                N_Warp_Tile,
+                K_Warp_Tile,
+                CodegenPipelineProblem::TransposeC,
+                memory_operation,
+                1,
+                true,
+                GroupedConvTraitsType::VectorSizeC>>;
+
+            using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel<GroupedConvTraitsType,
+                                                                         TilePartitioner,
+                                                                         CodegenPipeline,
+                                                                         ConvEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+            // NOTE(review): the grid is derived from the host args here, not from kargs — confirm this is the intended overload.
+            const dim3 grids  = Kernel::GridSize(args);
+            const dim3 blocks = Kernel::BlockSize();
+            // Unsupported arguments are rejected by throwing before any launch happens.
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+            }
+            // Print the kernel description and launch geometry to stdout when the stream config's log_level_ is positive.
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << CodegenShape::GetName() << '\n'
+                          << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                          << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << '\n'
+                          << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                          << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+            }
+            // kBlockPerCu is make_kernel's template argument; NOTE(review): the literal 0 is presumably the dynamic LDS size — confirm.
+            float ave_time = ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+            return ave_time;
+        };
+        // k_batch == 1 selects memory_operation_enum::set; every other value selects atomic_add.
+        if(args.k_batch == 1)
+        {
+            return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                                  ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                                  ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    }
+};
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
@@ -11,190 +11,14 @@

 #include "ck_tile/host.hpp"
 #include "grouped_convolution_utils.hpp"
-
-template <ck_tile::index_t NDimSpatial,
-          typename GemmWarpConfig,
-          typename InDataType,
-          typename WeiDataType,
-          typename AccDataType,
-          typename OutDataType,
-          typename InLayout,
-          typename WeiLayout,
-          typename OutLayout,
-          typename DsDataType     = ck_tile::tuple<>,
-          typename DsLayout       = ck_tile::tuple<>,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_tile::stream_config& s)
-{
-    constexpr int kBlockPerCu = 1;
-
-    constexpr ck_tile::index_t M_Tile = 64;
-    constexpr ck_tile::index_t N_Tile = 64;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile;
-    constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile;
-    constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile;
-
-    constexpr ck_tile::index_t VectorSizeA = 8;
-    constexpr ck_tile::index_t VectorSizeB = 8;
-    constexpr ck_tile::index_t VectorSizeC = 8;
-
-    // Implicit GEMM Traits
-    using CodegenShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    constexpr auto ConvSpec      = ck_tile::ConvolutionSpecialization::Default;
-    using TilePartitioner        = ck_tile::GemmTile1DPartitioner<CodegenShape>;
-    using GroupedConvTraitsType  = ck_tile::GroupedConvTraits<NDimSpatial,
-                                                              ConvSpec,
-                                                              InLayout,
-                                                              WeiLayout,
-                                                              DsLayout,
-                                                              OutLayout,
-                                                              VectorSizeA,
-                                                              VectorSizeB,
-                                                              VectorSizeC>;
-    using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<
-        InDataType,
-        WeiDataType,
-        AccDataType,
-        CodegenShape,
-        typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd,
-        ck_tile::element_wise::PassThrough,
-        ck_tile::element_wise::PassThrough,
-        InDataType,
-        true,
-        GroupedConvTraitsType::VectorSizeA,
-        GroupedConvTraitsType::VectorSizeB>;
-    using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
-
-    const auto Run = [&](const auto memory_operation_) {
-        constexpr auto memory_operation = memory_operation_.value;
-
-        using ConvEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<InDataType,
-                                             WeiDataType,
-                                             DsDataType,
-                                             AccDataType,
-                                             OutDataType,
-                                             typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                                             ck_tile::tensor_layout::gemm::RowMajor,
-                                             CDEElementWise,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             CodegenPipelineProblem::TransposeC,
-                                             memory_operation,
-                                             1,
-                                             true,
-                                             GroupedConvTraitsType::VectorSizeC>>;
-
-        using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
-                                                                TilePartitioner,
-                                                                CodegenPipeline,
-                                                                ConvEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
-
-        const dim3 grids  = Kernel::GridSize(kargs);
-        const dim3 blocks = Kernel::BlockSize();
-
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
-        }
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << CodegenShape::GetName() << '\n'
-                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << CodegenPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << '\n'
-                      << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
-                      << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
-                      << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
-        }
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    };
-
-    return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                          ck_tile::memory_operation_enum::set>{});
-}
-
+#include "grouped_convolution_forward_invoker.hpp"
 #include "run_grouped_convolution_fwd_example.inc"

-template <typename GemmWarpConfig,
-          typename InPrecType,
-          typename WeiPrecType = InPrecType,
-          typename OutPrecType = InPrecType>
-int run_grouped_conv_fwd_example_prec_type(
-    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
-{
-    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
-    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
-    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
-
-    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
-    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
-    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
-
-    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
-    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
-    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
-
-    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
-    {
-        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<1>{},
-                                                         GemmWarpConfig,
-                                                         InPrecType,
-                                                         WeiPrecType,
-                                                         OutPrecType>(
-            argc, argv, NWGC{}, GKXC{}, NWGK{});
-    }
-    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
-    {
-        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<2>{},
-                                                         GemmWarpConfig,
-                                                         InPrecType,
-                                                         WeiPrecType,
-                                                         OutPrecType>(
-            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
-    }
-    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "GKZYXC")
-    {
-        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<3>{},
-                                                         GemmWarpConfig,
-                                                         InPrecType,
-                                                         WeiPrecType,
-                                                         OutPrecType>(
-            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported memory layout!");
-    }
-}
-
 template <typename GemmWarpConfig>
 int run_grouped_conv_fwd_example(int argc, char* argv[])
 {
+    using Invoker = GroupedConvolutionForwardInvoker;
+
    auto [result, arg_parser] = create_args(argc, argv);
    if(!result)
        return -1;
@@ -206,12 +30,12 @@ int run_grouped_conv_fwd_example(int argc, char* argv[])

    if(data_type == "fp16")
    {
-        return run_grouped_conv_fwd_example_prec_type<GemmWarpConfig, ck_tile::half_t>(
+        return run_grouped_conv_fwd_example_prec_type<Invoker, GemmWarpConfig, ck_tile::half_t>(
            in_layout, wei_layout, out_layout, argc, argv);
    }
    else if(data_type == "bf16")
    {
-        return run_grouped_conv_fwd_example_prec_type<GemmWarpConfig, ck_tile::bf16_t>(
+        return run_grouped_conv_fwd_example_prec_type<Invoker, GemmWarpConfig, ck_tile::bf16_t>(
            in_layout, wei_layout, out_layout, argc, argv);
    }
    else
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "grouped_convolution_utils.hpp"
+
+struct GroupedConvolutionForwardInvoker
+{ // Only member is the static grouped_conv_fwd template below; it composes and launches the forward kernel.
+    template <ck_tile::index_t NDimSpatial,
+              typename GemmWarpConfig,
+              typename InDataType,
+              typename WeiDataType,
+              typename AccDataType,
+              typename OutDataType,
+              typename InLayout,
+              typename WeiLayout,
+              typename OutLayout,
+              typename DsDataType     = ck_tile::tuple<>,
+              typename DsLayout       = ck_tile::tuple<>,
+              typename CDEElementWise = ck_tile::element_wise::PassThrough>
+    static float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,
+                                  const ck_tile::stream_config& s)
+    {
+        constexpr int kBlockPerCu = 1;
+        // Hard-coded M/N/K tile sizes; they form the first sequence passed to TileGemmShape below.
+        constexpr ck_tile::index_t M_Tile = 64;
+        constexpr ck_tile::index_t N_Tile = 64;
+        constexpr ck_tile::index_t K_Tile = 64;
+        // M/N/K warp values; second TileGemmShape sequence, and M_Warp/N_Warp are also given to the epilogue.
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+        // Warp-tile sizes are taken from the caller-supplied GemmWarpConfig; third TileGemmShape sequence.
+        constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile;
+        constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile;
+        constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile;
+        // Vector sizes handed to GroupedConvTraits (A = 8, B = 8, C = 8 in this invoker).
+        constexpr ck_tile::index_t VectorSizeA = 8;
+        constexpr ck_tile::index_t VectorSizeB = 8;
+        constexpr ck_tile::index_t VectorSizeC = 8;
+
+        // Implicit GEMM Traits
+        using CodegenShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        constexpr auto ConvSpec      = ck_tile::ConvolutionSpecialization::Default;
+        using TilePartitioner        = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+        using GroupedConvTraitsType  = ck_tile::GroupedConvTraits<NDimSpatial,
+                                                                  ConvSpec,
+                                                                  InLayout,
+                                                                  WeiLayout,
+                                                                  DsLayout,
+                                                                  OutLayout,
+                                                                  VectorSizeA,
+                                                                  VectorSizeB,
+                                                                  VectorSizeC>;
+        using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<
+            InDataType,
+            WeiDataType,
+            AccDataType,
+            CodegenShape,
+            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd,
+            ck_tile::element_wise::PassThrough,
+            ck_tile::element_wise::PassThrough,
+            InDataType,
+            true,
+            GroupedConvTraitsType::VectorSizeA,
+            GroupedConvTraitsType::VectorSizeB>;
+        using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+        // Run takes the memory operation as an integral_constant so it can be used as a template argument of the epilogue.
+        const auto Run = [&](const auto memory_operation_) {
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+                InDataType,
+                WeiDataType,
+                DsDataType,
+                AccDataType,
+                OutDataType,
+                typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                ck_tile::tensor_layout::gemm::RowMajor,
+                CDEElementWise,
+                TilePartitioner::MPerBlock,
+                TilePartitioner::NPerBlock,
+                M_Warp,
+                N_Warp,
+                M_Warp_Tile,
+                N_Warp_Tile,
+                K_Warp_Tile,
+                CodegenPipelineProblem::TransposeC,
+                memory_operation,
+                1,
+                true,
+                GroupedConvTraitsType::VectorSizeC>>;
+
+            using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
+                                                                    TilePartitioner,
+                                                                    CodegenPipeline,
+                                                                    ConvEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+            // The grid is derived from the kernel args built on the previous line.
+            const dim3 grids  = Kernel::GridSize(kargs);
+            const dim3 blocks = Kernel::BlockSize();
+            // Unsupported arguments are rejected by throwing before any launch happens.
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+            }
+            // Print the kernel description and launch geometry to stdout when the stream config's log_level_ is positive.
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << CodegenShape::GetName() << '\n'
+                          << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                          << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << '\n'
+                          << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                          << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+            }
+            // kBlockPerCu is make_kernel's template argument; NOTE(review): the literal 0 is presumably the dynamic LDS size — confirm.
+            float ave_time = ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+            return ave_time;
+        };
+        // Forward always runs with memory_operation_enum::set; args.k_batch is not consulted in this function.
+        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                              ck_tile::memory_operation_enum::set>{});
+    }
+};
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
@@ -4,6 +4,7 @@

 template <ck_tile::index_t NDimSpatial,
          typename GemmWarpConfig,
+          typename Invoker,
          typename InDataType,
          typename WeiDataType,
          typename AccDataType,
@@ -15,15 +16,15 @@ float invoke_grouped_conv_bwd_data(ck_tile::GroupedConvBwdDataHostArgs& args,
                                   int n_warmup,
                                   int n_repeat)
 {
-    float ave_time = grouped_conv_bwd_data<NDimSpatial,
-                                           GemmWarpConfig,
-                                           InDataType,
-                                           WeiDataType,
-                                           AccDataType,
-                                           OutDataType,
-                                           InLayout,
-                                           WeiLayout,
-                                           OutLayout>(
+    float ave_time = Invoker::template grouped_conv_bwd_data<NDimSpatial,
+                                                             GemmWarpConfig,
+                                                             InDataType,
+                                                             WeiDataType,
+                                                             AccDataType,
+                                                             OutDataType,
+                                                             InLayout,
+                                                             WeiLayout,
+                                                             OutLayout>(
        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});

    std::size_t flop     = args.GetFlops();
@@ -39,6 +40,7 @@ float invoke_grouped_conv_bwd_data(ck_tile::GroupedConvBwdDataHostArgs& args,

 template <ck_tile::index_t NDimSpatial,
          typename GemmWarpConfig,
+          typename Invoker,
          typename InDataType,
          typename WeiDataType = InDataType,
          typename OutDataType = InDataType,
@@ -140,6 +142,7 @@ int run_grouped_conv_bwd_data_example_with_layouts(

    invoke_grouped_conv_bwd_data<NDimSpatial,
                                 GemmWarpConfig,
+                                 Invoker,
                                 InDataType,
                                 WeiDataType,
                                 AccDataType,
@@ -188,3 +191,59 @@ int run_grouped_conv_bwd_data_example_with_layouts(

    return pass;
 }
+
+// Maps the run-time (in, wei, out) layout names onto the matching 1D/2D/3D ck_tile layout tags
+// and returns the result of run_grouped_conv_bwd_data_example_with_layouts for that combination.
+template <typename Invoker,
+          typename GemmWarpConfig,
+          typename InPrecType,
+          typename WeiPrecType = InPrecType,
+          typename OutPrecType = InPrecType>
+int run_grouped_conv_bwd_data_example_prec_type(
+    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
+{
+    using InLayout1D  = ck_tile::tensor_layout::convolution::NWGC;
+    using WeiLayout1D = ck_tile::tensor_layout::convolution::GKXC;
+    using OutLayout1D = ck_tile::tensor_layout::convolution::NWGK;
+    using InLayout2D  = ck_tile::tensor_layout::convolution::NHWGC;
+    using WeiLayout2D = ck_tile::tensor_layout::convolution::GKYXC;
+    using OutLayout2D = ck_tile::tensor_layout::convolution::NHWGK;
+    using InLayout3D  = ck_tile::tensor_layout::convolution::NDHWGC;
+    using WeiLayout3D = ck_tile::tensor_layout::convolution::GKZYXC;
+    using OutLayout3D = ck_tile::tensor_layout::convolution::NDHWGK;
+
+    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
+    {
+        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<1>{},
+                                                              GemmWarpConfig,
+                                                              Invoker,
+                                                              InPrecType,
+                                                              WeiPrecType,
+                                                              OutPrecType>(
+            argc, argv, InLayout1D{}, WeiLayout1D{}, OutLayout1D{});
+    }
+
+    if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
+    {
+        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<2>{},
+                                                              GemmWarpConfig,
+                                                              Invoker,
+                                                              InPrecType,
+                                                              WeiPrecType,
+                                                              OutPrecType>(
+            argc, argv, InLayout2D{}, WeiLayout2D{}, OutLayout2D{});
+    }
+
+    if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
+    {
+        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<3>{},
+                                                              GemmWarpConfig,
+                                                              Invoker,
+                                                              InPrecType,
+                                                              WeiPrecType,
+                                                              OutPrecType>(
+            argc, argv, InLayout3D{}, WeiLayout3D{}, OutLayout3D{});
+    }
+    // None of the supported (in, wei, out) layout combinations matched.
+    throw std::runtime_error("Unsupported memory layout!");
+}
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
@@ -4,6 +4,7 @@

 template <ck_tile::index_t NDimSpatial,
          typename GemmWarpConfig,
+          typename Invoker,
          typename InDataType,
          typename WeiDataType,
          typename AccDataType,
@@ -15,15 +16,15 @@ float invoke_grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,
                              int n_warmup,
                              int n_repeat)
 {
-    float ave_time = grouped_conv_fwd<NDimSpatial,
-                                      GemmWarpConfig,
-                                      InDataType,
-                                      WeiDataType,
-                                      AccDataType,
-                                      OutDataType,
-                                      InLayout,
-                                      WeiLayout,
-                                      OutLayout>(
+    float ave_time = Invoker::template grouped_conv_fwd<NDimSpatial,
+                                                        GemmWarpConfig,
+                                                        InDataType,
+                                                        WeiDataType,
+                                                        AccDataType,
+                                                        OutDataType,
+                                                        InLayout,
+                                                        WeiLayout,
+                                                        OutLayout>(
        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});

    std::size_t flop     = args.GetFlops();
@@ -39,6 +40,7 @@ float invoke_grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,

 template <ck_tile::index_t NDimSpatial,
          typename GemmWarpConfig,
+          typename Invoker,
          typename InDataType,
          typename WeiDataType = InDataType,
          typename OutDataType = InDataType,
@@ -140,6 +142,7 @@ int run_grouped_conv_fwd_example_with_layouts(

    invoke_grouped_conv_fwd<NDimSpatial,
                            GemmWarpConfig,
+                            Invoker,
                            InDataType,
                            WeiDataType,
                            AccDataType,
@@ -188,3 +191,59 @@ int run_grouped_conv_fwd_example_with_layouts(

    return pass;
 }
+
+// Maps the run-time (in, wei, out) layout names onto the matching 1D/2D/3D ck_tile layout tags
+// and returns the result of run_grouped_conv_fwd_example_with_layouts for that combination.
+template <typename Invoker,
+          typename GemmWarpConfig,
+          typename InPrecType,
+          typename WeiPrecType = InPrecType,
+          typename OutPrecType = InPrecType>
+int run_grouped_conv_fwd_example_prec_type(
+    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
+{
+    using InLayout1D  = ck_tile::tensor_layout::convolution::NWGC;
+    using WeiLayout1D = ck_tile::tensor_layout::convolution::GKXC;
+    using OutLayout1D = ck_tile::tensor_layout::convolution::NWGK;
+    using InLayout2D  = ck_tile::tensor_layout::convolution::NHWGC;
+    using WeiLayout2D = ck_tile::tensor_layout::convolution::GKYXC;
+    using OutLayout2D = ck_tile::tensor_layout::convolution::NHWGK;
+    using InLayout3D  = ck_tile::tensor_layout::convolution::NDHWGC;
+    using WeiLayout3D = ck_tile::tensor_layout::convolution::GKZYXC;
+    using OutLayout3D = ck_tile::tensor_layout::convolution::NDHWGK;
+
+    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<1>{},
+                                                         GemmWarpConfig,
+                                                         Invoker,
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, InLayout1D{}, WeiLayout1D{}, OutLayout1D{});
+    }
+
+    if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<2>{},
+                                                         GemmWarpConfig,
+                                                         Invoker,
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, InLayout2D{}, WeiLayout2D{}, OutLayout2D{});
+    }
+
+    if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<3>{},
+                                                         GemmWarpConfig,
+                                                         Invoker,
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, InLayout3D{}, WeiLayout3D{}, OutLayout3D{});
+    }
+    // None of the supported (in, wei, out) layout combinations matched.
+    throw std::runtime_error("Unsupported memory layout!");
+}