[CK TILE] Grouped Convolution Forward Kernel (#2188)

* [CK TILE] Grouped Convolution Forward Kernel * custom vector size * fixes * refactor * rebase fixes * fixes * fixes [ROCm/composable_kernel commit: cebdee4d9e]
2026-05-14 02:02:46 +00:00 · 2025-06-21 00:44:36 +02:00
parent 21db8f9138
commit 2567f5e538
17 changed files with 3096 additions and 19 deletions
--- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt
+++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(tile_example_grouped_conv_fwd EXCLUDE_FROM_ALL grouped_convolution_forward.cpp)
+set(EXAMPLE_CONV_COMPILE_OPTIONS)
+list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
+target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "grouped_convolution_utils.hpp"
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename AccDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename DsDataType     = ck_tile::tuple<>,
+          typename DsLayout       = ck_tile::tuple<>,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float grouped_conv_fwd(const ck_tile::GroupedConvHostArgs& args, const ck_tile::stream_config& s)
+{
+    constexpr int kBlockPerCu = 1;
+
+    constexpr ck_tile::index_t M_Tile = 64;
+    constexpr ck_tile::index_t N_Tile = 64;
+    constexpr ck_tile::index_t K_Tile = 64;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    constexpr ck_tile::index_t VectorSizeA = 8;
+    constexpr ck_tile::index_t VectorSizeB = 8;
+    constexpr ck_tile::index_t VectorSizeC = 8;
+
+    // Implicit GEMM Traits
+    using CodegenShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
+    using TilePartitioner   = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+    using GroupedConvTraitsType =
+        ck_tile::GroupedConvTraits<NDimSpatial, ConvSpec, InLayout, WeiLayout, DsLayout, OutLayout>;
+    using CodegenPipelineProblem =
+        ck_tile::GemmPipelineProblem<InDataType,
+                                     WeiDataType,
+                                     AccDataType,
+                                     CodegenShape,
+                                     typename GroupedConvTraitsType::GroupedConvImplicitGemmTraits,
+                                     InDataType,
+                                     true,
+                                     VectorSizeA,
+                                     VectorSizeB>;
+    using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+    const auto Run = [&](const auto memory_operation_) {
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using ConvEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<InDataType,
+                                             WeiDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             OutDataType,
+                                             typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                                             ck_tile::tensor_layout::gemm::RowMajor,
+                                             CDEElementWise,
+                                             CodegenPipelineProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             CodegenPipelineProblem::TransposeC,
+                                             memory_operation,
+                                             1,
+                                             true,
+                                             VectorSizeC>>;
+
+        using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
+                                                                TilePartitioner,
+                                                                CodegenPipeline,
+                                                                ConvEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids      = Kernel::GridSize(args);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << CodegenShape::GetName() << '\n'
+                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << '\n'
+                      << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                      << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                      << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+        }
+
+        float ave_time = ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+        return ave_time;
+    };
+
+    return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                          ck_tile::memory_operation_enum::set>{});
+}
+
+#include "run_grouped_convolution_example.inc"
+
+template <typename InPrecType, typename WeiPrecType = InPrecType, typename OutPrecType = InPrecType>
+int run_grouped_conv_fwd_example_prec_type(
+    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
+{
+    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
+    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
+    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
+
+    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
+    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
+    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
+
+    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
+    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
+    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
+
+    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<1>{},
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, NWGC{}, GKXC{}, NWGK{});
+    }
+    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<2>{},
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
+    }
+    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "GKZYXC")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<3>{},
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported memory layout!");
+    }
+}
+
+int run_grouped_conv_fwd_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    std::string data_type  = arg_parser.get_str("prec");
+    std::string in_layout  = arg_parser.get_str("in_layout");
+    std::string wei_layout = arg_parser.get_str("weight_layout");
+    std::string out_layout = arg_parser.get_str("out_layout");
+
+    if(data_type == "fp16")
+    {
+        return run_grouped_conv_fwd_example_prec_type<ck_tile::half_t>(
+            in_layout, wei_layout, out_layout, argc, argv);
+    }
+    else if(data_type == "bf16")
+    {
+        return run_grouped_conv_fwd_example_prec_type<ck_tile::bf16_t>(
+            in_layout, wei_layout, out_layout, argc, argv);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation !!!");
+    }
+}
+
+int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_example(argc, argv); }
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
@@ -0,0 +1,108 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/grouped_convolution.hpp"
+
+ck_tile::index_t fill_spatial_dimensions(std::vector<ck_tile::index_t>& filter_spatial_lengths,
+                                         std::vector<ck_tile::index_t>& image_spatial_lengths,
+                                         std::vector<ck_tile::index_t>& strides,
+                                         std::vector<ck_tile::index_t>& dilations,
+                                         std::vector<ck_tile::index_t>& lpads,
+                                         std::vector<ck_tile::index_t>& rpads,
+                                         ck_tile::ArgParser& arg_parser)
+{
+
+    constexpr ck_tile::index_t non_sp_dims = 3;
+    const ck_tile::index_t n_dim_sp        = arg_parser.get_str("in_layout").size() - non_sp_dims;
+
+    if(!(n_dim_sp >= 1 && n_dim_sp <= 3))
+    {
+        throw std::runtime_error("Wrong layout!\n");
+    }
+
+    if(n_dim_sp == 3)
+    {
+        filter_spatial_lengths.push_back(arg_parser.get_int("z"));
+        image_spatial_lengths.push_back(arg_parser.get_int("d"));
+        strides.push_back(arg_parser.get_int("stride_d"));
+        dilations.push_back(arg_parser.get_int("dilation_d"));
+        lpads.push_back(arg_parser.get_int("lpad_d"));
+        rpads.push_back(arg_parser.get_int("rpad_d"));
+    }
+    if(n_dim_sp >= 2)
+    {
+        filter_spatial_lengths.push_back(arg_parser.get_int("y"));
+        image_spatial_lengths.push_back(arg_parser.get_int("h"));
+        strides.push_back(arg_parser.get_int("stride_h"));
+        dilations.push_back(arg_parser.get_int("dilation_h"));
+        lpads.push_back(arg_parser.get_int("lpad_h"));
+        rpads.push_back(arg_parser.get_int("rpad_h"));
+    }
+    filter_spatial_lengths.push_back(arg_parser.get_int("x"));
+    image_spatial_lengths.push_back(arg_parser.get_int("w"));
+    strides.push_back(arg_parser.get_int("stride_w"));
+    dilations.push_back(arg_parser.get_int("dilation_w"));
+    lpads.push_back(arg_parser.get_int("lpad_w"));
+    rpads.push_back(arg_parser.get_int("rpad_w"));
+
+    return n_dim_sp;
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("g", "2", "group dimension")
+        .insert("n", "32", "n dimension")
+        .insert("k", "32", "k dimension")
+        .insert("c", "32", "c dimension")
+
+        .insert("d", "64", "d dimension")
+        .insert("h", "64", "h dimension")
+        .insert("w", "64", "w dimension")
+
+        .insert("z", "4", "z dimension")
+        .insert("y", "4", "y dimension")
+        .insert("x", "4", "x dimension")
+
+        .insert("stride_d", "1", "d stride")
+        .insert("stride_h", "1", "h stride")
+        .insert("stride_w", "1", "w stride")
+
+        .insert("dilation_d", "1", "d dilation")
+        .insert("dilation_h", "1", "h dilation")
+        .insert("dilation_w", "1", "w dilation")
+
+        .insert("lpad_d", "0", "left pad for d dimension")
+        .insert("lpad_h", "0", "left pad for h dimension")
+        .insert("lpad_w", "0", "left pad for w dimension")
+
+        .insert("rpad_d", "0", "right pad for d dimension")
+        .insert("rpad_h", "0", "right pad for h dimension")
+        .insert("rpad_w", "0", "right pad for w dimension")
+
+        .insert("in_layout", "NHWGC", "Input image layout - NHWGC by default")
+        .insert("weight_layout", "GKYXC", "Weight layout - GKYXC by default")
+        .insert("out_layout", "NHWGK", "Output image layout - NHWGK by default")
+        .insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("split_k", "1", "splitK value")
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// host API
+float grouped_conv_fwd(const ck_tile::GroupedConvHostArgs& args, const ck_tile::stream_config& s);
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_example.inc
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+template <typename InDataType, typename WeiDataType, typename AccDataType, typename OutDataType>
+auto calculate_rtol_atol(const ck_tile::index_t GemmK,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(InDataType) < sizeof(WeiDataType), InDataType, WeiDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, OutDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(GemmK, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, OutDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(GemmK, kbatch));
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<OutDataType, OutDataType, OutDataType>(kbatch);
+    const auto atol_split_k =
+        ck_tile::get_absolute_threshold<OutDataType, OutDataType, OutDataType>(
+            max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename AccDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+float invoke_grouped_conv_fwd(ck_tile::GroupedConvHostArgs& args, int n_warmup, int n_repeat)
+{
+    float ave_time = grouped_conv_fwd<NDimSpatial,
+                                      InDataType,
+                                      WeiDataType,
+                                      AccDataType,
+                                      OutDataType,
+                                      InLayout,
+                                      WeiLayout,
+                                      OutLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::size_t flop     = args.GetFlops();
+    std::size_t num_byte = args.GetByte<InDataType, WeiDataType, OutDataType>();
+    float tflops         = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec     = num_byte / 1.E6 / ave_time;
+
+    std::cout << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    return ave_time;
+}
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType = InDataType,
+          typename OutDataType = InDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+int run_grouped_conv_fwd_example_with_layouts(
+    int argc, char* argv[], const InLayout, const WeiLayout, const OutLayout)
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using AccDataType = float;
+
+    std::vector<ck_tile::index_t> filter_spatial_lengths;
+    std::vector<ck_tile::index_t> image_spatial_lengths;
+    std::vector<ck_tile::index_t> strides;
+    std::vector<ck_tile::index_t> dilations;
+    std::vector<ck_tile::index_t> lpads;
+    std::vector<ck_tile::index_t> rpads;
+
+    const ck_tile::index_t num_dim_sp = fill_spatial_dimensions(filter_spatial_lengths,
+                                                                image_spatial_lengths,
+                                                                strides,
+                                                                dilations,
+                                                                lpads,
+                                                                rpads,
+                                                                arg_parser);
+
+    ck_tile::conv::ConvParam conv_param{num_dim_sp,
+                                        arg_parser.get_int("g"),
+                                        arg_parser.get_int("n"),
+                                        arg_parser.get_int("k"),
+                                        arg_parser.get_int("c"),
+                                        filter_spatial_lengths,
+                                        image_spatial_lengths,
+                                        strides,
+                                        dilations,
+                                        lpads,
+                                        rpads};
+
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+
+    const auto in_g_n_c_wis_desc =
+        ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+    const auto out_g_n_k_wos_desc =
+        ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    ck_tile::HostTensor<InDataType> input(in_g_n_c_wis_desc);
+    ck_tile::HostTensor<WeiDataType> weight(wei_g_k_c_xs_desc);
+    ck_tile::HostTensor<OutDataType> output(out_g_n_k_wos_desc);
+
+    if(init_method == 0)
+    {
+        ck_tile::FillUniformDistribution<InDataType>{-5.f, 5.f}(input);
+        ck_tile::FillUniformDistribution<WeiDataType>{-5.f, 5.f}(weight);
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<InDataType>{}(input);
+        ck_tile::FillMonotonicSeq<WeiDataType>{}(weight);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillUniformDistribution<InDataType>{1.f, 1.f}(input);
+        ck_tile::FillUniformDistribution<WeiDataType>{1.f, 1.f}(weight);
+    }
+    else
+    {
+        input.SetZero();
+        weight.SetZero();
+    }
+
+    ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
+
+    input_dev_buf.ToDevice(input.data());
+    weight_dev_buf.ToDevice(weight.data());
+    output_dev_buf.SetZero();
+
+    ck_tile::GroupedConvHostArgs args(conv_param,
+                                      input_dev_buf.GetDeviceBuffer(),
+                                      weight_dev_buf.GetDeviceBuffer(),
+                                      {},
+                                      output_dev_buf.GetDeviceBuffer(),
+                                      kbatch);
+
+    std::cout << "Run Grouped Conv Fwd kernel" << std::endl;
+    std::cout << "input: " << input.mDesc << std::endl;
+    std::cout << "weight: " << weight.mDesc << std::endl;
+    std::cout << "output: " << output.mDesc << std::endl;
+
+    invoke_grouped_conv_fwd<NDimSpatial,
+                            InDataType,
+                            WeiDataType,
+                            AccDataType,
+                            OutDataType,
+                            InLayout,
+                            WeiLayout,
+                            OutLayout>(args, n_warmup, n_repeat);
+
+    output_dev_buf.FromDevice(output.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<OutDataType> output_host_ref(out_g_n_k_wos_desc);
+        output_host_ref.SetZero();
+
+        ck_tile::reference_grouped_conv_fwd<NDimSpatial, InDataType, WeiDataType, OutDataType>(
+            input,
+            weight,
+            output_host_ref,
+            conv_param.conv_filter_strides_,
+            conv_param.conv_filter_dilations_,
+            conv_param.input_left_pads_,
+            conv_param.input_right_pads_);
+        const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_);
+        const float max_accumulated_value =
+            *std::max_element(output_host_ref.mData.begin(), output_host_ref.mData.end());
+        const auto rtol_atol =
+            calculate_rtol_atol<InDataType, WeiDataType, AccDataType, OutDataType>(
+                GemmK, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(output,
+                                  output_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        throw std::runtime_error("Unsupported gpu verification !!!");
+    }
+
+    return pass;
+}
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -19,6 +19,7 @@ add_subdirectory(16_batched_gemm)
 add_subdirectory(17_grouped_gemm)
 add_subdirectory(18_flatmm)
 add_subdirectory(19_gemm_multi_d)
+add_subdirectory(20_grouped_convolution)
 add_subdirectory(35_batched_transpose)
 add_subdirectory(36_copy)
 add_subdirectory(37_transpose)
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -27,6 +27,7 @@
 #include "ck_tile/host/reference/reference_elementwise.hpp"
 #include "ck_tile/host/reference/reference_fused_moe.hpp"
 #include "ck_tile/host/reference/reference_gemm.hpp"
+#include "ck_tile/host/reference/reference_grouped_conv_fwd.hpp"
 #include "ck_tile/host/reference/reference_im2col.hpp"
 #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp"
 #include "ck_tile/host/reference/reference_moe_sorting.hpp"
--- a/include/ck_tile/host/reference/reference_grouped_conv_fwd.hpp
+++ b/include/ck_tile/host/reference/reference_grouped_conv_fwd.hpp
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+#include <thread>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+
+namespace ck_tile {
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType>
+CK_TILE_HOST void reference_grouped_conv_fwd(const HostTensor<InDataType>& input,
+                                             const HostTensor<WeiDataType>& weight,
+                                             HostTensor<OutDataType>& output,
+                                             std::vector<ck_tile::long_index_t> conv_strides,
+                                             std::vector<ck_tile::long_index_t> conv_dilations,
+                                             std::vector<ck_tile::long_index_t> in_left_pads,
+                                             std::vector<ck_tile::long_index_t>)
+{
+    if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
+         weight.get_num_of_dimension() == NDimSpatial + 3 &&
+         output.get_num_of_dimension() == NDimSpatial + 3))
+    {
+        throw std::runtime_error("wrong! inconsistent dimension");
+    }
+
+    if constexpr(NDimSpatial == 1)
+    {
+        auto func = [&](auto g, auto n, auto k, auto wo) {
+            float v_acc = 0;
+
+            for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
+            {
+                for(std::size_t x = 0; x < weight.get_lengths()[3]; ++x)
+                {
+                    auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[0]) +
+                              static_cast<ck_tile::long_index_t>(x * conv_dilations[0]) -
+                              static_cast<ck_tile::long_index_t>(in_left_pads[0]);
+
+                    if(wi >= 0 && ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[3])
+                    {
+                        InDataType v_in   = input(g, n, c, wi);
+                        WeiDataType v_wei = weight(g, k, c, x);
+                        v_acc += ck_tile::type_convert<float>(v_in) *
+                                 ck_tile::type_convert<float>(v_wei);
+                    }
+                }
+            }
+            OutDataType v_acc_converted = ck_tile::type_convert<OutDataType>(v_acc);
+            output(g, n, k, wo)         = v_acc_converted;
+        };
+
+        make_ParallelTensorFunctor(func,
+                                   output.get_lengths()[0],
+                                   output.get_lengths()[1],
+                                   output.get_lengths()[2],
+                                   output.get_lengths()[3])(std::thread::hardware_concurrency());
+    }
+    else if constexpr(NDimSpatial == 2)
+    {
+        auto func = [&](auto g, auto n, auto k, auto ho, auto wo) {
+            float v_acc = 0;
+
+            for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
+            {
+                for(std::size_t y = 0; y < weight.get_lengths()[3]; ++y)
+                {
+                    auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[0]) +
+                              static_cast<ck_tile::long_index_t>(y * conv_dilations[0]) -
+                              static_cast<ck_tile::long_index_t>(in_left_pads[0]);
+
+                    for(std::size_t x = 0; x < weight.get_lengths()[4]; ++x)
+                    {
+                        auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[1]) +
+                                  static_cast<ck_tile::long_index_t>(x * conv_dilations[1]) -
+                                  static_cast<ck_tile::long_index_t>(in_left_pads[1]);
+
+                        if(hi >= 0 &&
+                           ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[3] &&
+                           wi >= 0 &&
+                           ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[4])
+                        {
+                            InDataType v_in   = input(g, n, c, hi, wi);
+                            WeiDataType v_wei = weight(g, k, c, y, x);
+
+                            v_acc += ck_tile::type_convert<float>(v_in) *
+                                     ck_tile::type_convert<float>(v_wei);
+                        }
+                    }
+                }
+            }
+            OutDataType v_acc_converted = ck_tile::type_convert<OutDataType>(v_acc);
+            output(g, n, k, ho, wo)     = v_acc_converted;
+        };
+
+        make_ParallelTensorFunctor(func,
+                                   output.get_lengths()[0],
+                                   output.get_lengths()[1],
+                                   output.get_lengths()[2],
+                                   output.get_lengths()[3],
+                                   output.get_lengths()[4])(std::thread::hardware_concurrency());
+    }
+    else if constexpr(NDimSpatial == 3)
+    {
+        auto func = [&](auto g, auto n, auto k, auto d_o, auto ho, auto wo) {
+            float v_acc = 0;
+
+            for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
+            {
+                for(std::size_t z = 0; z < weight.get_lengths()[3]; ++z)
+                {
+                    auto di = static_cast<ck_tile::long_index_t>(d_o * conv_strides[0]) +
+                              static_cast<ck_tile::long_index_t>(z * conv_dilations[0]) -
+                              static_cast<ck_tile::long_index_t>(in_left_pads[0]);
+                    for(std::size_t y = 0; y < weight.get_lengths()[4]; ++y)
+                    {
+                        auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[1]) +
+                                  static_cast<ck_tile::long_index_t>(y * conv_dilations[1]) -
+                                  static_cast<ck_tile::long_index_t>(in_left_pads[1]);
+                        for(std::size_t x = 0; x < weight.get_lengths()[5]; ++x)
+                        {
+                            auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[2]) +
+                                      static_cast<ck_tile::long_index_t>(x * conv_dilations[2]) -
+                                      static_cast<ck_tile::long_index_t>(in_left_pads[2]);
+                            if(di >= 0 &&
+                               ck_tile::type_convert<std::size_t>(di) < input.get_lengths()[3] &&
+                               hi >= 0 &&
+                               ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[4] &&
+                               wi >= 0 &&
+                               ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[5])
+                            {
+                                InDataType v_in   = input(g, n, c, di, hi, wi);
+                                WeiDataType v_wei = weight(g, k, c, z, y, x);
+
+                                v_acc += ck_tile::type_convert<float>(v_in) *
+                                         ck_tile::type_convert<float>(v_wei);
+                            }
+                        }
+                    }
+                }
+            }
+            OutDataType v_acc_converted  = ck_tile::type_convert<OutDataType>(v_acc);
+            output(g, n, k, d_o, ho, wo) = v_acc_converted;
+        };
+
+        make_ParallelTensorFunctor(func,
+                                   output.get_lengths()[0],
+                                   output.get_lengths()[1],
+                                   output.get_lengths()[2],
+                                   output.get_lengths()[3],
+                                   output.get_lengths()[4],
+                                   output.get_lengths()[5])(std::thread::hardware_concurrency());
+    }
+    else
+    {
+        throw std::runtime_error("Ref_Conv_fwd: number of dimensions must be between 1 and 3.");
+    }
+}
+} // namespace ck_tile
--- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -27,7 +27,9 @@ template <typename ADataType_,
          index_t KPerXdl_,
          bool isCTransposed_,
          memory_operation_enum MemoryOperation_,
-          index_t kNumWaveGroups_ = 1>
+          index_t kNumWaveGroups_ = 1,
+          bool FixedVectorSize_   = false,
+          index_t VectorSizeC_    = 1>
 struct CShuffleEpilogueProblem
 {
    using ADataType                                        = remove_cvref_t<ADataType_>;
@@ -48,6 +50,8 @@ struct CShuffleEpilogueProblem
    static constexpr index_t KPerXdl                       = KPerXdl_;
    static constexpr index_t isCTransposed                 = isCTransposed_;
    static constexpr memory_operation_enum MemoryOperation = MemoryOperation_;
+    static constexpr bool FixedVectorSize                  = FixedVectorSize_;
+    static constexpr index_t VectorSizeC                   = VectorSizeC_;
    static constexpr index_t kNumWaveGroups                = kNumWaveGroups_;
    static constexpr index_t NumDTensor                    = DsDataType::size();

@@ -80,6 +84,8 @@ struct CShuffleEpilogue
    static constexpr index_t NPerXdl                       = Problem::NPerXdl;
    static constexpr index_t KPerXdl                       = Problem::KPerXdl;
    static constexpr index_t isCTransposed                 = Problem::isCTransposed;
+    static constexpr bool FixedVectorSize                  = Problem::FixedVectorSize;
+    static constexpr index_t VectorSizeC                   = Problem::VectorSizeC;
    static constexpr index_t MPerIteration                 = MPerXdl * MWave;
    static constexpr index_t NPerIteration                 = NPerXdl * NWave;
    static constexpr index_t NumDTensor                    = Problem::NumDTensor;
@@ -98,6 +104,10 @@ struct CShuffleEpilogue
     */
    CK_TILE_HOST_DEVICE static constexpr index_t GetVectorSizeC()
    {
+        if constexpr(FixedVectorSize)
+        {
+            return VectorSizeC;
+        }
        constexpr index_t max_vector_size = 16;
        if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
        {
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
@@ -1,8 +1,7 @@
 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5_default_policy.hpp"
 #include "ck_tile/host/concat.hpp"

 namespace ck_tile {
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
@@ -121,7 +121,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy

        if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
        {
-            constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+            constexpr index_t M1           = Problem::VectorSizeA;
            constexpr index_t M0           = MPerBlock / M1;
            constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
            static_assert(total_pixels % M1 == 0);
@@ -211,7 +211,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy

        if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
        {
-            constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
+            constexpr index_t N1           = Problem::VectorSizeB;
            constexpr index_t N0           = NPerBlock / N1;
            constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize;
            static_assert(total_pixels % N1 == 0);
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
@@ -14,7 +14,10 @@ template <typename ADataType_,
          typename CDataType_,
          typename BlockGemmShape_,
          typename Traits_,
-          typename ComputeDataType_ = ADataType_>
+          typename ComputeDataType_ = ADataType_,
+          bool FixedVectorSize_     = false,
+          index_t VectorSizeA_      = 1,
+          index_t VectorSizeB_      = 1>
 struct GemmPipelineProblemBase
 {
    using Traits = remove_cvref_t<Traits_>;
@@ -24,6 +27,8 @@ struct GemmPipelineProblemBase
    using CDataType       = remove_cvref_t<CDataType_>;
    using ComputeDataType = remove_cvref_t<ComputeDataType_>;

+    static constexpr bool FixedVectorSize = FixedVectorSize_;
+
    using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;

    using ALayout = remove_cvref_t<typename Traits::ALayout>;
@@ -115,7 +120,11 @@ struct GemmPipelineProblemBase
    }

    static constexpr index_t VectorSizeA = []() {
-        if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+        if constexpr(FixedVectorSize)
+        {
+            return VectorSizeA_;
+        }
+        else if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
        {
            return kPadK ? 1 : GetAlignmentA();
        }
@@ -126,7 +135,11 @@ struct GemmPipelineProblemBase
    }();

    static constexpr index_t VectorSizeB = []() {
-        if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
+        if constexpr(FixedVectorSize)
+        {
+            return VectorSizeB_;
+        }
+        else if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
        {
            return kPadN ? 1 : GetAlignmentB();
        }
@@ -153,13 +166,19 @@ template <typename ADataType_,
          typename CDataType_,
          typename BlockGemmShape_,
          typename Traits_,
-          typename ComputeDataType_ = ADataType_>
+          typename ComputeDataType_ = ADataType_,
+          bool FixedVectorSize_     = false,
+          index_t VectorSizeA_      = 1,
+          index_t VectorSizeB_      = 1>
 using GemmPipelineProblem = GemmPipelineProblemBase<ADataType_,
                                                    BDataType_,
                                                    CDataType_,
                                                    BlockGemmShape_,
                                                    Traits_,
-                                                    ComputeDataType_>;
+                                                    ComputeDataType_,
+                                                    FixedVectorSize_,
+                                                    VectorSizeA_,
+                                                    VectorSizeB_>;

 template <typename ADataType_,
          typename BDataType_,
@@ -169,7 +188,10 @@ template <typename ADataType_,
          GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave,
          bool HasHotLoop_                 = true,
          TailNumber TailNum_              = TailNumber::Full,
-          typename ComputeDataType_        = ADataType_>
+          typename ComputeDataType_        = ADataType_,
+          bool FixedVectorSize_            = false,
+          index_t VectorSizeA_             = 1,
+          index_t VectorSizeB_             = 1>
 struct UniversalGemmPipelineProblem
 {
    using Traits = remove_cvref_t<Traits_>;
@@ -179,6 +201,10 @@ struct UniversalGemmPipelineProblem
    using CDataType       = remove_cvref_t<CDataType_>;
    using ComputeDataType = remove_cvref_t<ComputeDataType_>;

+    static constexpr bool FixedVectorSize = FixedVectorSize_;
+    static constexpr index_t VectorSizeA  = VectorSizeA_;
+    static constexpr index_t VectorSizeB  = VectorSizeB_;
+
    using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;

    using ALayout = remove_cvref_t<typename Traits::ALayout>;
--- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -426,10 +426,11 @@ struct UniversalGemmBasePolicy
    {
        using ALayout = remove_cvref_t<typename Problem::ALayout>;

-        constexpr index_t BlockSize     = Problem::kBlockSize;
-        constexpr index_t MPerBlock     = Problem::BlockGemmShape::kM;
-        constexpr index_t KPerBlock     = Problem::BlockGemmShape::kK;
-        constexpr index_t VecLoadSize   = GetVectorSizeA<Problem>();
+        constexpr index_t BlockSize = Problem::kBlockSize;
+        constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+        constexpr index_t VecLoadSize =
+            Problem::FixedVectorSize ? Problem::VectorSizeA : GetVectorSizeA<Problem>();
        constexpr index_t NumWaveGroups = Problem::NumWaveGroups;

        // Tile: MPerBlock X KPerBlock
@@ -461,10 +462,11 @@ struct UniversalGemmBasePolicy
    {
        using BLayout = remove_cvref_t<typename Problem::BLayout>;

-        constexpr index_t BlockSize     = Problem::kBlockSize;
-        constexpr index_t NPerBlock     = Problem::BlockGemmShape::kN;
-        constexpr index_t KPerBlock     = Problem::BlockGemmShape::kK;
-        constexpr index_t VecLoadSize   = GetVectorSizeB<Problem>();
+        constexpr index_t BlockSize = Problem::kBlockSize;
+        constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+        constexpr index_t VecLoadSize =
+            Problem::FixedVectorSize ? Problem::VectorSizeB : GetVectorSizeB<Problem>();
        constexpr index_t NumWaveGroups = Problem::NumWaveGroups;

        // Tile: KPerBlock X NPerBlock
--- a/include/ck_tile/ops/grouped_convolution.hpp
+++ b/include/ck_tile/ops/grouped_convolution.hpp
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/common/utils.hpp"
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
@@ -0,0 +1,800 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/host/concat.hpp"
+#include "ck_tile/core/utility/env.hpp"
+#include "ck_tile/host/convolution_parameter.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
+
+namespace ck_tile {
+
+/// @brief The Grouped Convolution kernel device arguments.
+template <typename GroupedConvTraitsType>
+struct GroupedConvFwdKernelArgs
+{
+
+    using ConvToGemmFwdTransformer =
+        TransformConvFwdToGemm<GroupedConvTraitsType::NDimSpatial,
+                               GroupedConvTraitsType::ConvSpecialization>;
+    static constexpr index_t NumDTensor = GroupedConvTraitsType::NumDTensor;
+
+    template <
+        typename InLay                      = typename GroupedConvTraitsType::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType::OutLayout,
+        typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NWGC> &&
+                                    std::is_same_v<WeiLay, tensor_layout::convolution::GKXC> &&
+                                    std::is_same_v<OutLay, tensor_layout::convolution::NWGK>,
+                                bool>::type = false>
+    CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvHostArgs& args)
+    {
+        in_g_n_c_wis_lengths  = {static_cast<index_t>(args.G_),
+                                static_cast<index_t>(args.N_),
+                                static_cast<index_t>(args.C_),
+                                static_cast<index_t>(args.input_spatial_lengths_[0])};
+        wei_g_k_c_xs_lengths  = {static_cast<index_t>(args.G_),
+                                static_cast<index_t>(args.K_),
+                                static_cast<index_t>(args.C_),
+                                static_cast<index_t>(args.filter_spatial_lengths_[0])};
+        out_g_n_k_wos_lengths = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.N_),
+                                 static_cast<index_t>(args.K_),
+                                 static_cast<index_t>(args.output_spatial_lengths_[0])};
+
+        conv_filter_strides   = {static_cast<index_t>(args.conv_filter_strides_[0])};
+        conv_filter_dilations = {static_cast<index_t>(args.conv_filter_dilations_[0])};
+        input_left_pads       = {static_cast<index_t>(args.input_left_pads_[0])};
+        input_right_pads      = {static_cast<index_t>(args.input_right_pads_[0])};
+
+        k_batch = args.k_batch;
+
+        GemmM = args.N_ * args.output_spatial_lengths_[0];
+        GemmN = args.K_;
+        GemmK = args.C_ * args.filter_spatial_lengths_[0];
+
+        in_ptr  = args.in_ptr;
+        wei_ptr = args.wei_ptr;
+        for(index_t d = 0; d < NumDTensor; d++)
+        {
+            ds_ptr[d] = args.ds_ptr[d];
+        }
+        out_ptr = args.out_ptr;
+
+        ConvToGemmFwdTransformer conv_to_gemm_transformer{in_g_n_c_wis_lengths,
+                                                          wei_g_k_c_xs_lengths,
+                                                          out_g_n_k_wos_lengths,
+                                                          conv_filter_strides,
+                                                          conv_filter_dilations,
+                                                          input_left_pads,
+                                                          input_right_pads};
+
+        a_grid_desc_m_k =
+            conv_to_gemm_transformer
+                .template MakeADescriptor_M_K<typename GroupedConvTraitsType::InLayout>();
+        b_grid_desc_n_k =
+            conv_to_gemm_transformer
+                .template MakeBDescriptor_N_K<typename GroupedConvTraitsType::WeiLayout>();
+        c_grid_desc_m_n =
+            conv_to_gemm_transformer
+                .template MakeCDescriptor_M_N<typename GroupedConvTraitsType::OutLayout>();
+
+        group_stride_a = args.C_;
+        group_stride_b = args.K_ * args.C_ *
+                         std::accumulate(args.filter_spatial_lengths_.begin(),
+                                         args.filter_spatial_lengths_.end(),
+                                         1,
+                                         std::multiplies<index_t>());
+        group_stride_c = args.K_;
+    }
+
+    template <
+        typename InLay                      = typename GroupedConvTraitsType::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType::OutLayout,
+        typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NHWGC> &&
+                                    std::is_same_v<WeiLay, tensor_layout::convolution::GKYXC> &&
+                                    std::is_same_v<OutLay, tensor_layout::convolution::NHWGK>,
+                                bool>::type = false>
+    CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvHostArgs& args)
+    {
+        in_g_n_c_wis_lengths  = {static_cast<index_t>(args.G_),
+                                static_cast<index_t>(args.N_),
+                                static_cast<index_t>(args.C_),
+                                static_cast<index_t>(args.input_spatial_lengths_[0]),
+                                static_cast<index_t>(args.input_spatial_lengths_[1])};
+        wei_g_k_c_xs_lengths  = {static_cast<index_t>(args.G_),
+                                static_cast<index_t>(args.K_),
+                                static_cast<index_t>(args.C_),
+                                static_cast<index_t>(args.filter_spatial_lengths_[0]),
+                                static_cast<index_t>(args.filter_spatial_lengths_[1])};
+        out_g_n_k_wos_lengths = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.N_),
+                                 static_cast<index_t>(args.K_),
+                                 static_cast<index_t>(args.output_spatial_lengths_[0]),
+                                 static_cast<index_t>(args.output_spatial_lengths_[1])};
+
+        conv_filter_strides   = {static_cast<index_t>(args.conv_filter_strides_[0]),
+                               static_cast<index_t>(args.conv_filter_strides_[1])};
+        conv_filter_dilations = {static_cast<index_t>(args.conv_filter_dilations_[0]),
+                                 static_cast<index_t>(args.conv_filter_dilations_[1])};
+        input_left_pads       = {static_cast<index_t>(args.input_left_pads_[0]),
+                           static_cast<index_t>(args.input_left_pads_[1])};
+        input_right_pads      = {static_cast<index_t>(args.input_right_pads_[0]),
+                            static_cast<index_t>(args.input_right_pads_[1])};
+
+        k_batch = args.k_batch;
+
+        GemmM = args.N_ * args.output_spatial_lengths_[0] * args.output_spatial_lengths_[1];
+        GemmN = args.K_;
+        GemmK = args.C_ * args.filter_spatial_lengths_[0] * args.filter_spatial_lengths_[1];
+
+        in_ptr  = args.in_ptr;
+        wei_ptr = args.wei_ptr;
+        for(index_t d = 0; d < NumDTensor; d++)
+        {
+            ds_ptr[d] = args.ds_ptr[d];
+        }
+        out_ptr = args.out_ptr;
+
+        ConvToGemmFwdTransformer conv_to_gemm_transformer{in_g_n_c_wis_lengths,
+                                                          wei_g_k_c_xs_lengths,
+                                                          out_g_n_k_wos_lengths,
+                                                          conv_filter_strides,
+                                                          conv_filter_dilations,
+                                                          input_left_pads,
+                                                          input_right_pads};
+
+        a_grid_desc_m_k =
+            conv_to_gemm_transformer
+                .template MakeADescriptor_M_K<typename GroupedConvTraitsType::InLayout>();
+        b_grid_desc_n_k =
+            conv_to_gemm_transformer
+                .template MakeBDescriptor_N_K<typename GroupedConvTraitsType::WeiLayout>();
+        c_grid_desc_m_n =
+            conv_to_gemm_transformer
+                .template MakeCDescriptor_M_N<typename GroupedConvTraitsType::OutLayout>();
+
+        group_stride_a = args.C_;
+        group_stride_b = args.K_ * args.C_ *
+                         std::accumulate(args.filter_spatial_lengths_.begin(),
+                                         args.filter_spatial_lengths_.end(),
+                                         1,
+                                         std::multiplies<index_t>());
+        group_stride_c = args.K_;
+    }
+
+    template <
+        typename InLay                      = typename GroupedConvTraitsType::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType::OutLayout,
+        typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NDHWGC> &&
+                                    std::is_same_v<WeiLay, tensor_layout::convolution::GKZYXC> &&
+                                    std::is_same_v<OutLay, tensor_layout::convolution::NDHWGK>,
+                                bool>::type = false>
+    CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvHostArgs& args)
+    {
+        in_g_n_c_wis_lengths  = {static_cast<index_t>(args.G_),
+                                static_cast<index_t>(args.N_),
+                                static_cast<index_t>(args.C_),
+                                static_cast<index_t>(args.input_spatial_lengths_[0]),
+                                static_cast<index_t>(args.input_spatial_lengths_[1]),
+                                static_cast<index_t>(args.input_spatial_lengths_[2])};
+        wei_g_k_c_xs_lengths  = {static_cast<index_t>(args.G_),
+                                static_cast<index_t>(args.K_),
+                                static_cast<index_t>(args.C_),
+                                static_cast<index_t>(args.filter_spatial_lengths_[0]),
+                                static_cast<index_t>(args.filter_spatial_lengths_[1]),
+                                static_cast<index_t>(args.filter_spatial_lengths_[2])};
+        out_g_n_k_wos_lengths = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.N_),
+                                 static_cast<index_t>(args.K_),
+                                 static_cast<index_t>(args.output_spatial_lengths_[0]),
+                                 static_cast<index_t>(args.output_spatial_lengths_[1]),
+                                 static_cast<index_t>(args.output_spatial_lengths_[2])};
+
+        conv_filter_strides   = {static_cast<index_t>(args.conv_filter_strides_[0]),
+                               static_cast<index_t>(args.conv_filter_strides_[1]),
+                               static_cast<index_t>(args.conv_filter_strides_[2])};
+        conv_filter_dilations = {static_cast<index_t>(args.conv_filter_dilations_[0]),
+                                 static_cast<index_t>(args.conv_filter_dilations_[1]),
+                                 static_cast<index_t>(args.conv_filter_dilations_[2])};
+        input_left_pads       = {static_cast<index_t>(args.input_left_pads_[0]),
+                           static_cast<index_t>(args.input_left_pads_[1]),
+                           static_cast<index_t>(args.input_left_pads_[2])};
+        input_right_pads      = {static_cast<index_t>(args.input_right_pads_[0]),
+                            static_cast<index_t>(args.input_right_pads_[1]),
+                            static_cast<index_t>(args.input_right_pads_[2])};
+
+        k_batch = args.k_batch;
+
+        GemmM = args.N_ * args.output_spatial_lengths_[0] * args.output_spatial_lengths_[1] *
+                args.output_spatial_lengths_[2];
+        GemmN = args.K_;
+        GemmK = args.C_ * args.filter_spatial_lengths_[0] * args.filter_spatial_lengths_[1] *
+                args.filter_spatial_lengths_[2];
+
+        in_ptr  = args.in_ptr;
+        wei_ptr = args.wei_ptr;
+        for(index_t d = 0; d < NumDTensor; d++)
+        {
+            ds_ptr[d] = args.ds_ptr[d];
+        }
+        out_ptr = args.out_ptr;
+
+        ConvToGemmFwdTransformer conv_to_gemm_transformer{in_g_n_c_wis_lengths,
+                                                          wei_g_k_c_xs_lengths,
+                                                          out_g_n_k_wos_lengths,
+                                                          conv_filter_strides,
+                                                          conv_filter_dilations,
+                                                          input_left_pads,
+                                                          input_right_pads};
+
+        a_grid_desc_m_k =
+            conv_to_gemm_transformer
+                .template MakeADescriptor_M_K<typename GroupedConvTraitsType::InLayout>();
+        b_grid_desc_n_k =
+            conv_to_gemm_transformer
+                .template MakeBDescriptor_N_K<typename GroupedConvTraitsType::WeiLayout>();
+        c_grid_desc_m_n =
+            conv_to_gemm_transformer
+                .template MakeCDescriptor_M_N<typename GroupedConvTraitsType::OutLayout>();
+
+        group_stride_a = args.C_;
+        group_stride_b = args.K_ * args.C_ *
+                         std::accumulate(args.filter_spatial_lengths_.begin(),
+                                         args.filter_spatial_lengths_.end(),
+                                         1,
+                                         std::multiplies<index_t>());
+        group_stride_c = args.K_;
+    }
+
+    using AGridDescMK = remove_cvref_t<decltype(
+        ConvToGemmFwdTransformer{}
+            .template MakeADescriptor_M_K<typename GroupedConvTraitsType::InLayout>())>;
+    using BGridDescNK = remove_cvref_t<decltype(
+        ConvToGemmFwdTransformer{}
+            .template MakeBDescriptor_N_K<typename GroupedConvTraitsType::WeiLayout>())>;
+    using CGridDescMN = remove_cvref_t<decltype(
+        ConvToGemmFwdTransformer{}
+            .template MakeCDescriptor_M_N<typename GroupedConvTraitsType::OutLayout>())>;
+
+    static constexpr index_t NonSpatialDims = 3;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType::NDimSpatial> in_g_n_c_wis_lengths;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType::NDimSpatial> wei_g_k_c_xs_lengths;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType::NDimSpatial> out_g_n_k_wos_lengths;
+
+    array<index_t, GroupedConvTraitsType::NDimSpatial> conv_filter_strides;
+    array<index_t, GroupedConvTraitsType::NDimSpatial> conv_filter_dilations;
+    array<index_t, GroupedConvTraitsType::NDimSpatial> input_left_pads;
+    array<index_t, GroupedConvTraitsType::NDimSpatial> input_right_pads;
+
+    index_t k_batch;
+    index_t GemmM;
+    index_t GemmN;
+    index_t GemmK;
+
+    const void* in_ptr;
+    const void* wei_ptr;
+    std::array<const void*, NumDTensor> ds_ptr;
+    void* out_ptr;
+
+    AGridDescMK a_grid_desc_m_k;
+    BGridDescNK b_grid_desc_n_k;
+    CGridDescMN c_grid_desc_m_n;
+
+    long_index_t group_stride_a;
+    long_index_t group_stride_b;
+    long_index_t group_stride_c;
+};
+
+/// @brief The Grouped Convolution Forward kernel template.
+///
+/// @paragraph Overview Overview
+///            This class provides the grouped convolution forward kernel template. By semantic
+///            division of Implicit GEMM algorithm into following parts we achieve flexible,
+///            versatile and robust kernel implementation.
+///
+///            @li @b Prolog - The start of GEMM kernel implementation in @ref operator()
+///                function call operator" which determines the work scope of each workgroup.
+///            @li @b GemmPipeline - The core part @a "heart" of matrix multiplication algorithm.
+///                This is the place where each workgroup is loading data from global memory and
+///                carrying out dot products.
+///            @li @b Epilogue - The @a "final" part of matrix multiplication implementation
+///                 responsible for storing results to global memory. This is also the place where
+///                 any additional operator fusion may take place.
+///
+///            Additionally both @ref GemmPipeline_ "GemmPipeline" and @ref EpiloguePipeline_
+///            "EpiloguePipeline" are parameterized with so called @a Policy which determines all
+///            internal details of those functional parts. You can think of it like both gemm and
+///            epilogue pipelines provides the control-flow logic controlled by policies. Moreover
+///            the policy is responsible for definition of all necessary data layouts and thread's
+///            work distribution.
+///
+/// @tparam GroupedConvTraitsType       The type of class providing traits for grouped convolution.
+/// @tparam TilePartitioner_            The type of class providing mapping of workgroup index into
+/// the
+///                                     output data tile to be calculated. It determines the
+///                                     workgroup to data relationship (or in other words - which
+///                                     data would be processed and calculated by which workgroup).
+/// @tparam GemmPipeline_               The type of class which provides the core part of matrix
+///                                     multiplication. This class should provide implementation of
+///                                     data loading from global memory and performing block-wise
+///                                     matrix multiplication. You can think of it as a work done by
+///                                     single workgroup point of view.
+/// @tparam EpiloguePipeline_           The type of class providing the final part of matrix
+///                                     multiplication implementation. It is responsible for storing
+///                                     results calculated by @ref GemmPipeline_ "GemmPipeline" to
+///                                     the output C tensor in global memory.
+template <typename GroupedConvTraitsType,
+          typename TilePartitioner_,
+          typename GemmPipeline_,
+          typename EpiloguePipeline_>
+struct GroupedConvolutionForwardKernel
+{
+    static constexpr index_t NDimSpatial = GroupedConvTraitsType::NDimSpatial;
+    static constexpr ConvolutionSpecialization ConvSpecialization =
+        GroupedConvTraitsType::ConvSpecialization;
+    using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
+    using GemmPipeline     = remove_cvref_t<GemmPipeline_>;
+    using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
+    using GemmALayout      = remove_cvref_t<typename GemmPipeline::ALayout>;
+    using GemmBLayout      = remove_cvref_t<typename GemmPipeline::BLayout>;
+    using GemmCLayout      = remove_cvref_t<typename GemmPipeline::CLayout>;
+
+    using InLayout  = remove_cvref_t<typename GroupedConvTraitsType::InLayout>;
+    using WeiLayout = remove_cvref_t<typename GroupedConvTraitsType::WeiLayout>;
+    using OutLayout = remove_cvref_t<typename GroupedConvTraitsType::OutLayout>;
+    using DsLayout  = remove_cvref_t<typename GroupedConvTraitsType::DsLayout>;
+
+    using GemmDsLayout = remove_cvref_t<typename EpiloguePipeline::DsLayout>;
+
+    static constexpr index_t NumDTensor = GroupedConvTraitsType::NumDTensor;
+
+    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+
+    using InDataType  = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using WeiDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
+    using DsDataType  = remove_cvref_t<typename EpiloguePipeline::DsDataType>;
+    // Below type is actually accumulation data type - the output of block GEMM.
+    using OutDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+
+    using GroupedConvFwdKernelArgsSpecialized = GroupedConvFwdKernelArgs<GroupedConvTraitsType>;
+
+    // TODO: Enable this
+    static constexpr bool IsSplitKSupported = false;
+
+    static constexpr auto I0 = number<0>();
+    static constexpr auto I1 = number<1>();
+    static constexpr auto I2 = number<2>();
+    static constexpr auto I3 = number<3>();
+
+    static_assert(GemmPipeline::kPadM && GemmPipeline::kPadN && GemmPipeline::kPadK,
+                  "Not supported!");
+    static_assert(std::is_same_v<GemmALayout, tensor_layout::gemm::RowMajor>, "Not supported!");
+    static_assert(std::is_same_v<GemmBLayout, tensor_layout::gemm::ColumnMajor>, "Not supported!");
+    static_assert(std::is_same_v<GemmCLayout, tensor_layout::gemm::RowMajor>, "Not supported!");
+
+    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
+    {
+        // clang-format off
+        return concat('_', "grouped_convolution_forward", gemm_prec_str<InDataType, WeiDataType>, GemmPipeline::GetName());
+        // clang-format on
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const GroupedConvHostArgs& args)
+    {
+        const index_t GemmM = args.N_ * std::accumulate(args.output_spatial_lengths_.begin(),
+                                                        args.output_spatial_lengths_.end(),
+                                                        1,
+                                                        std::multiplies<index_t>());
+        const index_t GemmN = args.K_;
+        return dim3(TilePartitioner::GridSize(GemmM, GemmN), args.G_, args.k_batch);
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+
+    CK_TILE_HOST static constexpr GroupedConvFwdKernelArgsSpecialized
+    MakeKernelArgs(const GroupedConvHostArgs& hostArgs)
+    {
+        return GroupedConvFwdKernelArgsSpecialized(hostArgs);
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+    }
+
+    CK_TILE_HOST static bool IsSupportedArgument(const GroupedConvFwdKernelArgsSpecialized& kargs)
+    {
+        if constexpr((EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                      is_any_of<OutDataType, fp16_t, bf16_t>::value) ||
+                     !IsSplitKSupported)
+        {
+            if(kargs.k_batch != 1)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("Conditions not met for Kbatch >1 !");
+                }
+                return false;
+            }
+        }
+
+        const index_t ConvK = kargs.wei_g_k_c_xs_lengths[number<1>{}];
+        const index_t ConvC = kargs.wei_g_k_c_xs_lengths[number<2>{}];
+
+        // check ConvolutionSpecialization
+        if constexpr(ConvSpecialization == ConvolutionSpecialization::Filter1x1Stride1Pad0)
+        {
+            // check if it's 1x1, stride=1 conv
+            for(index_t i = 0; i < NDimSpatial; ++i)
+            {
+                const index_t SpatialDim = kargs.wei_g_k_c_xs_lengths[i + 3];
+                const index_t ConvStride = kargs.conv_filter_strides[i];
+                const index_t LeftPad    = kargs.input_left_pads[i];
+                const index_t RightPad   = kargs.input_right_pads[i];
+
+                if(!(SpatialDim == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0))
+                {
+                    return false;
+                }
+            }
+        }
+        else if constexpr(ConvSpecialization == ConvolutionSpecialization::Filter1x1Pad0)
+        {
+            // check if it's 1x1 conv
+            for(index_t i = 0; i < NDimSpatial; ++i)
+            {
+                const index_t SpatialDim = kargs.wei_g_k_c_xs_lengths[i + 3];
+                const index_t LeftPad    = kargs.input_left_pads[i];
+                const index_t RightPad   = kargs.input_right_pads[i];
+
+                if(!(SpatialDim == 1 && LeftPad == 0 && RightPad == 0))
+                {
+                    return false;
+                }
+            }
+        }
+        else if constexpr(ConvSpecialization == ConvolutionSpecialization::Filter3x3)
+        {
+            if(ConvC != 1)
+            {
+                return false;
+            }
+            for(index_t i = 0; i < NDimSpatial; ++i)
+            {
+                const index_t filter_spatial_dim = kargs.wei_g_k_c_xs_lengths[i + I3];
+
+                if(filter_spatial_dim != I3)
+                {
+                    return false;
+                }
+            }
+        }
+
+        namespace ctc = tensor_layout::convolution;
+
+        if constexpr(std::is_same_v<InLayout, ctc::NWGC> || std::is_same_v<InLayout, ctc::NHWGC> ||
+                     std::is_same_v<InLayout, ctc::NDHWGC>)
+        {
+            // Check access per C
+            if(ConvC % GemmPipeline::GetVectorSizeA() != 0)
+            {
+                CK_TILE_ERROR("Conv C is not a multiple of vector load size for input image!");
+                return false;
+            }
+        }
+        else
+        {
+            CK_TILE_ERROR("Not supported input layout!");
+            return false;
+        }
+
+        // check vector access of B
+        // FIXME: layout
+        if constexpr(std::is_same_v<WeiLayout, ctc::GKXC> ||
+                     std::is_same_v<WeiLayout, ctc::GKYXC> ||
+                     std::is_same_v<WeiLayout, ctc::GKZYXC>)
+        {
+            if(ConvC % GemmPipeline::GetVectorSizeB() != 0)
+            {
+                CK_TILE_ERROR("Conv C is not a multiple of vector load size for weight!");
+                return false;
+            }
+        }
+        else
+        {
+            CK_TILE_ERROR("Not supported weight layout!");
+            return false;
+        }
+
+        // check vector access of E
+        if constexpr(std::is_same_v<OutLayout, ctc::NWGK> ||
+                     std::is_same_v<OutLayout, ctc::NHWGK> ||
+                     std::is_same_v<OutLayout, ctc::NDHWGK>)
+        {
+            if(ConvK % EpiloguePipeline::GetVectorSizeC() != 0)
+            {
+                CK_TILE_ERROR("Conv K is not a multiple of vector store size for output image!");
+                return false;
+            }
+        }
+        else
+        {
+            CK_TILE_ERROR("Not supported output layout!");
+            return false;
+        }
+
+        return true;
+    }
+
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static auto
+    MakeGemmTensorViews(const InDataType* a_ptr,
+                        const WeiDataType* b_ptr,
+                        const std::array<const void*, NumDTensor>& ds_ptr,
+                        OutDataType* c_ptr,
+                        const GroupedConvFwdKernelArgsSpecialized& kargs)
+    {
+        static_assert(!TilePartitioner::BlockGemmShape::PermuteA, "Not implemented!");
+        static_assert(!TilePartitioner::BlockGemmShape::PermuteB, "Not implemented!");
+        const auto& a_tensor_view = [&]() {
+            return make_tensor_view<address_space_enum::global>(a_ptr, kargs.a_grid_desc_m_k);
+        }();
+
+        const auto& b_tensor_view = [&]() {
+            return make_tensor_view<address_space_enum::global>(b_ptr, kargs.b_grid_desc_n_k);
+        }();
+
+        // TODO: enable vector write for C in ColMajor
+        const auto& c_tensor_view = [&]() {
+            return make_tensor_view<address_space_enum::global>(c_ptr, kargs.c_grid_desc_m_n);
+        }();
+
+        const auto& ds_tensor_view = generate_tuple(
+            [&](auto i) {
+                static_assert(std::is_same_v<std::tuple_element_t<i, DsLayout>, OutLayout>,
+                              "Not supported!");
+                static_assert(std::is_same_v<GemmCLayout, tensor_layout::gemm::RowMajor>,
+                              "Not supported!");
+                static_assert(std::is_same_v<std::tuple_element_t<i, DsDataType>, OutDataType>,
+                              "Not supported!");
+
+                return make_tensor_view<address_space_enum::global>(
+                    static_cast<OutDataType*>(ds_ptr[i]), kargs.c_grid_desc_m_n);
+            },
+            number<NumDTensor>{});
+
+        return make_tuple(a_tensor_view, b_tensor_view, ds_tensor_view, c_tensor_view);
+    }
+
+    template <typename TensorView>
+    CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
+    {
+        const auto& a_pad_view = [&]() {
+            const auto& a_tensor_view = views.at(I0);
+            return pad_tensor_view(a_tensor_view,
+                                   make_tuple(number<TilePartitioner::MPerBlock>{},
+                                              number<TilePartitioner::KPerBlock>{}),
+                                   sequence<true, true>{});
+        }();
+
+        const auto& b_pad_view = [&]() {
+            const auto& b_tensor_view = views.at(I1);
+            return pad_tensor_view(b_tensor_view,
+                                   make_tuple(number<TilePartitioner::NPerBlock>{},
+                                              number<TilePartitioner::KPerBlock>{}),
+                                   sequence<true, true>{});
+        }();
+
+        const auto& ds_tensor_view = views.at(I2);
+        const auto& ds_pad_view    = generate_tuple(
+            [&](auto i) {
+                return pad_tensor_view(ds_tensor_view[i],
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::NPerBlock>{}),
+                                       sequence<true, true>{});
+            },
+            number<NumDTensor>{});
+
+        const auto& c_pad_view = [&]() {
+            const auto& c_tensor_view = views.at(I3);
+            return pad_tensor_view(c_tensor_view,
+                                   make_tuple(number<TilePartitioner::MPerBlock>{},
+                                              number<TilePartitioner::NPerBlock>{}),
+                                   sequence<true, true>{});
+        }();
+
+        return make_tuple(a_pad_view, b_pad_view, ds_pad_view, c_pad_view);
+    }
+
+    template <typename PadView>
+    CK_TILE_DEVICE static auto
+    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
+    {
+        const auto& a_pad_view  = views.at(I0);
+        const auto& b_pad_view  = views.at(I1);
+        const auto& ds_pad_view = views.at(I2);
+        const auto& c_pad_view  = views.at(I3);
+
+        const auto& a_block_window = [&]() {
+            return make_tile_window(a_pad_view,
+                                    make_tuple(number<TilePartitioner::MPerBlock>{},
+                                               number<TilePartitioner::KPerBlock>{}),
+                                    {i_m, 0});
+        }();
+
+        const auto& b_block_window = [&]() {
+            return make_tile_window(b_pad_view,
+                                    make_tuple(number<TilePartitioner::NPerBlock>{},
+                                               number<TilePartitioner::KPerBlock>{}),
+                                    {i_n, 0});
+        }();
+
+        const auto ds_block_window = generate_tuple(
+            [&](auto i) {
+                return make_tile_window(ds_pad_view[i],
+                                        make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                   number<TilePartitioner::NPerBlock>{}),
+                                        {i_m, i_n});
+            },
+            number<NumDTensor>{});
+
+        auto c_block_window = make_tile_window(
+            c_pad_view,
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
+            {i_m, i_n});
+
+        return make_tuple(a_block_window, b_block_window, ds_block_window, c_block_window);
+    }
+
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     *
+     * @param a_ptr input A pointer
+     * @param b_ptr input B pointer
+     * @param c_ptr output C pointer
+     * @param smem_ptr_0 The start memory pointer of the shared memory block.
+     * @param kargs Grouped Convolution Forward kernel arguments
+     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
+     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     *
+     */
+    CK_TILE_DEVICE static void RunGemm(const InDataType* a_ptr,
+                                       const WeiDataType* b_ptr,
+                                       const std::array<const void*, NumDTensor>& ds_ptr,
+                                       OutDataType* c_ptr,
+                                       void* smem_ptr_0,
+                                       const GroupedConvFwdKernelArgsSpecialized& kargs,
+                                       const index_t block_idx_m,
+                                       const index_t block_idx_n)
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple =
+            MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
+                a_ptr, b_ptr, ds_ptr, c_ptr, kargs);
+
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+        const index_t num_loop =
+            __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window = gemm_tile_windows.at(I0);
+        const auto& b_block_window = gemm_tile_windows.at(I1);
+        const auto& d_block_window = gemm_tile_windows.at(I2);
+
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            a_block_window, b_block_window, num_loop, smem_ptr_0);
+
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(I3);
+
+        EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
+            c_block_window, c_block_tile, d_block_window, smem_ptr_0);
+    }
+
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     *
+     * @note RunGEMM2LDS in with two shared memory buffers using the ping pong buffer mechanism.
+     *
+     * @param a_ptr input A pointer
+     * @param b_ptr input B pointer
+     * @param c_ptr output C pointer
+     * @param smem_ptr_0 The starting pointer of 1st shared memory block.
+     * @param smem_ptr_1 The starting pointer of 2nd shared memory block.
+     * @param kargs Grouped Convolution Forward kernel arguments
+     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
+     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     *
+     */
+    CK_TILE_DEVICE static void RunGemm2LDS(const InDataType* a_ptr,
+                                           const WeiDataType* b_ptr,
+                                           const std::array<const void*, NumDTensor>& ds_ptr,
+                                           OutDataType* c_ptr,
+                                           void* __restrict__ smem_ptr_0,
+                                           void* __restrict__ smem_ptr_1,
+                                           const GroupedConvFwdKernelArgsSpecialized& kargs,
+                                           const index_t block_idx_m,
+                                           const index_t block_idx_n)
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple =
+            MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
+                a_ptr, b_ptr, ds_ptr, c_ptr, kargs);
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+        const index_t num_loop =
+            __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window = gemm_tile_windows.at(I0);
+        const auto& b_block_window = gemm_tile_windows.at(I1);
+        const auto& d_block_window = gemm_tile_windows.at(I2);
+
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            a_block_window, b_block_window, num_loop, smem_ptr_0, smem_ptr_1);
+
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(I3);
+
+        EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
+            c_block_window, c_block_tile, d_block_window, smem_ptr_0, smem_ptr_1);
+    }
+
+    CK_TILE_DEVICE void operator()(GroupedConvFwdKernelArgsSpecialized kargs) const
+    {
+        const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const auto [iM, iN] =
+            TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
+        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+
+        const auto blockIdY       = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
+        const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
+        const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
+
+        // options
+        const InDataType* a_ptr  = static_cast<const InDataType*>(kargs.in_ptr) + group_offset_a;
+        const WeiDataType* b_ptr = static_cast<const WeiDataType*>(kargs.wei_ptr) + group_offset_b;
+        OutDataType* c_ptr       = static_cast<OutDataType*>(kargs.out_ptr) + group_offset_c;
+
+        // allocate LDS
+        __shared__ char smem_ptr_0[GetSmemSize()];
+
+        if constexpr(GemmPipeline::DoubleSmemBuffer == true)
+        {
+            __shared__ char smem_ptr_1[GetSmemSize()];
+            if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
+                           EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                           is_any_of<OutDataType, fp16_t, bf16_t>::value))
+            {
+                RunGemm2LDS(
+                    a_ptr, b_ptr, kargs.ds_ptr, c_ptr, smem_ptr_0, smem_ptr_1, kargs, i_m, i_n);
+            }
+        }
+        else
+        {
+            if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
+                           EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                           is_any_of<OutDataType, fp16_t, bf16_t>::value))
+            {
+                RunGemm(a_ptr, b_ptr, kargs.ds_ptr, c_ptr, smem_ptr_0, kargs, i_m, i_n);
+            }
+        }
+    }
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+namespace ck_tile {
+
+enum struct ConvolutionSpecialization
+{
+    Default,
+    Filter1x1Pad0,
+    Filter1x1Stride1Pad0,
+    Filter3x3,
+};
+
+CK_TILE_HOST std::string getConvSpecializationString(const ConvolutionSpecialization& s)
+{
+    switch(s)
+    {
+    case ConvolutionSpecialization::Default: return "Default";
+    case ConvolutionSpecialization::Filter1x1Pad0: return "Filter1x1Pad0";
+    case ConvolutionSpecialization::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
+    case ConvolutionSpecialization::Filter3x3: return "Filter3x3";
+    default: return "Unrecognized specialization!";
+    }
+}
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/convolution_parameter.hpp"
+
+namespace ck_tile {
+
+/// @brief The Grouped Conv kernel host arguments.
+///
+/// @par Overview
+///      This structure is passed to Grouped Convolution Kernels when creating kernel
+///      arguments object. It contain all necessary information required to
+///      build proper kernel argument and launch kernel on GPU.
+struct GroupedConvHostArgs : public conv::ConvParam
+{
+    CK_TILE_HOST GroupedConvHostArgs() = delete;
+    CK_TILE_HOST GroupedConvHostArgs(ConvParam conv_param,
+                                     const void* in_ptr_,
+                                     const void* wei_ptr_,
+                                     const std::vector<const void*> ds_ptr_,
+                                     void* out_ptr_,
+                                     index_t k_batch_)
+        : conv::ConvParam(conv_param),
+          in_ptr(in_ptr_),
+          wei_ptr(wei_ptr_),
+          ds_ptr(ds_ptr_),
+          out_ptr(out_ptr_),
+          k_batch(k_batch_)
+    {
+    }
+
+    const void* in_ptr;
+    const void* wei_ptr;
+    const std::vector<const void*> ds_ptr;
+    void* out_ptr;
+    index_t k_batch;
+};
+
+template <index_t NDimSpatial_,
+          ConvolutionSpecialization ConvSpecialization_,
+          typename InLayout_,
+          typename WeiLayout_,
+          typename DsLayout_,
+          typename OutLayout_>
+struct GroupedConvTraits
+{
+    private:
+    static constexpr auto generate_implicit_gemm_layout()
+    {
+        return generate_tuple([](auto) { return ck_tile::tensor_layout::gemm::RowMajor{}; },
+                              number<DsLayout_::size()>{});
+    }
+
+    public:
+    static constexpr index_t NDimSpatial                          = NDimSpatial_;
+    static constexpr ConvolutionSpecialization ConvSpecialization = ConvSpecialization_;
+    using InLayout                                                = InLayout_;
+    using WeiLayout                                               = WeiLayout_;
+    using DsLayout                                                = DsLayout_;
+    using OutLayout                                               = OutLayout_;
+    using GroupedConvImplicitGemmTraits                           = TileGemmTraits<true,
+                                                         true,
+                                                         true,
+                                                         ck_tile::tensor_layout::gemm::RowMajor,
+                                                         ck_tile::tensor_layout::gemm::ColumnMajor,
+                                                         ck_tile::tensor_layout::gemm::RowMajor>;
+    static constexpr index_t NumDTensor                           = DsLayout::size();
+    using ImplicitGemmDsLayout = decltype(generate_implicit_gemm_layout());
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp