diff --git a/example/ck_tile/01_fmha/CMakeLists.txt b/example/ck_tile/01_fmha/CMakeLists.txt index a2d7fe4aaf..b8ca26193d 100644 --- a/example/ck_tile/01_fmha/CMakeLists.txt +++ b/example/ck_tile/01_fmha/CMakeLists.txt @@ -47,7 +47,7 @@ set(FMHA_BWD_CODE_GEN_COMMON_ARGS ${CMAKE_CURRENT_LIST_DIR}/generate.py --api bwd --receipt 3 - --optdim 32,64,128,256 + --optdim 32,64,96,128,256 # --filter fmha_bwd_dot...@fmha_bwd_convert...@fmha_bwd... ) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 4df99a9a10..36482e94c1 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -380,6 +380,7 @@ def get_dq_dk_dv_tiles(dtype : str, tr_load: str) -> List[FmhaBwdDQDKDVTileSize] return [ FmhaBwdDQDKDVTileSize( 32, 128, 32, 32, 32, 32, 64, 32, 32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), FmhaBwdDQDKDVTileSize( 32, 128, 64, 32, 64, 32, 32, 64, 64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), + FmhaBwdDQDKDVTileSize( 32, 128, 96, 32, 96, 32, 32, 96, 96, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), # FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), FmhaBwdDQDKDVTileSize( 16, 64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), diff --git a/example/ck_tile/01_fmha/script/run_full_test.sh b/example/ck_tile/01_fmha/script/run_full_test.sh index e7babd2744..5c2a5a4b3d 100755 --- a/example/ck_tile/01_fmha/script/run_full_test.sh +++ b/example/ck_tile/01_fmha/script/run_full_test.sh @@ -34,15 +34,15 @@ function print_log_header(){ } #run verification tests -example/ck_tile/01_fmha/script/smoke_test_fwd.sh -example/ck_tile/01_fmha/script/smoke_test_bwd.sh +time example/ck_tile/01_fmha/script/smoke_test_fwd.sh +time example/ck_tile/01_fmha/script/smoke_test_bwd.sh #run performance benchmarks export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log" print_log_header $fmha_fwd_log $env_type $branch $host_name -example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log +time example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log export fmha_bwd_log="perf_fmha_bwd_$GPU_arch.log" print_log_header $fmha_bwd_log $env_type $branch $host_name -example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log +time example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log diff --git a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh index 3b59505ff0..ec2a6a0ceb 100755 --- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh +++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh @@ -6,7 +6,7 @@ SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) EXE_NAME=tile_example_fmha_bwd EXE="$(find . 
-name $EXE_NAME -type f | head -n 1)" KNAME=1 -GPU_arch=$GPU_arch +GPU_arch=${GPU_arch:-""} if [ -z "$GPU_arch" ] ; then GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}') fi @@ -31,7 +31,17 @@ run_exe() { set -ex } +test_h_s_mask() { + run_exe -b=1 -h=4 -h_k=2 -s=259 $@ + run_exe -b=2 -h=2 -s=516 -s_k=253 $@ + run_exe -b=1 -h=4 -h_k=1 -s=500 -s_k=251 -mask=1 $@ + run_exe -b=1 -h=2 -s=900 -s_k=258 -mask=2 $@ + run_exe -b=2 -h=1 -s=987 -s_k=219 -mask=t:128,30 $@ + run_exe -b=2 -h=3 -h_k=1 -s=244 -s_k=499 -mask=b:4,35 $@ +} + set -x +# main tests for prec in "fp16" "bf16" ; do for perm in 0 1 ; do for hdim in 32 64 128 256 ; do @@ -40,21 +50,21 @@ for bias in "n" "a" ; do for dbias in 0 ; do for p_drop in 0.0 0.2 ; do for deterministic in 0 ; do +test_h_s_mask -prec=$prec -d=$hdim -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS +done +done +done +done +done +done +done +done -run_exe -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=2 -h=2 -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=1 -h=2 -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=2 -h=1 -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS - -done -done -done -done -done -done -done +# additional cases +for hdim in 72 96 ; do +test_h_s_mask -prec=fp16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=0 -operm=0 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS +test_h_s_mask -prec=bf16 -d=$hdim -bias=n -dbias=0 -p_drop=0 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS +test_h_s_mask -prec=bf16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS done set +x diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp index 4f9362beb2..fa914a7119 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp @@ -11,199 +11,14 @@ #include "ck_tile/host.hpp" #include "grouped_convolution_utils.hpp" - -template , - typename DsLayout = ck_tile::tuple<>, - typename CDEElementWise = ck_tile::element_wise::PassThrough> -float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args, - const ck_tile::stream_config& s) -{ - constexpr int kBlockPerCu = 1; - - constexpr ck_tile::index_t M_Tile = 64; - constexpr 
ck_tile::index_t N_Tile = 64; - constexpr ck_tile::index_t K_Tile = 32; - - constexpr ck_tile::index_t M_Warp = 2; - constexpr ck_tile::index_t N_Warp = 2; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile; - constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile; - constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile; - - constexpr ck_tile::index_t VectorSizeA = 1; - constexpr ck_tile::index_t VectorSizeB = 1; - constexpr ck_tile::index_t VectorSizeC = 8; - - // Implicit GEMM Traits - using CodegenShape = - ck_tile::TileGemmShape, - ck_tile::sequence, - ck_tile::sequence>; - - constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; - using TilePartitioner = ck_tile::GemmTile1DPartitioner; - using GroupedConvTraitsType = ck_tile::GroupedConvTraits; - using CodegenPipelineProblem = ck_tile::GemmPipelineProblem< - InDataType, - WeiDataType, - AccDataType, - CodegenShape, - typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData, - ck_tile::element_wise::PassThrough, - ck_tile::element_wise::PassThrough, - InDataType, - true, - GroupedConvTraitsType::VectorSizeA, - GroupedConvTraitsType::VectorSizeB>; - using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; - - const auto Run = [&](const auto memory_operation_) { - constexpr auto memory_operation = memory_operation_.value; - - using ConvEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; - - using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel; - auto kargs = Kernel::MakeKernelArgs(args); - - const dim3 grids = Kernel::GridSize(args); - const dim3 blocks = Kernel::BlockSize(); - - if(!Kernel::IsSupportedArgument(kargs)) - { - throw std::runtime_error("Wrong! Arguments not supported! 
Skipping conv!\n"); - } - - if(s.log_level_ > 0) - { - std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' - << "shape: " << CodegenShape::GetName() << '\n' - << "problem: " << CodegenPipelineProblem::GetName() << '\n' - << "pipeline: " << CodegenPipeline::GetName() << '\n' - << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" - << '\n' - << "Vector size A: " << CodegenPipeline::GetVectorSizeA() - << ", Vector size B: " << CodegenPipeline::GetVectorSizeB() - << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; - } - - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - - return ave_time; - }; - - if(args.k_batch == 1) - { - return Run(ck_tile::integral_constant{}); - } - else - { - return Run(ck_tile::integral_constant{}); - } -} - +#include "grouped_convolution_backward_data_invoker.hpp" #include "run_grouped_convolution_bwd_data_example.inc" -template -int run_grouped_conv_bwd_data_example_prec_type( - std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[]) -{ - using NWGC = ck_tile::tensor_layout::convolution::NWGC; - using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; - using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; - - using GKXC = ck_tile::tensor_layout::convolution::GKXC; - using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; - using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; - - using NWGK = ck_tile::tensor_layout::convolution::NWGK; - using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; - using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; - - if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK") - { - return run_grouped_conv_bwd_data_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NWGC{}, GKXC{}, NWGK{}); - } - else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK") - { - return run_grouped_conv_bwd_data_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NHWGC{}, GKYXC{}, NHWGK{}); - } - else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK") - { - return run_grouped_conv_bwd_data_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{}); - } - else - { - throw std::runtime_error("Unsupported memory layout!"); - } -} - template int run_grouped_conv_bwd_data_example(int argc, char* argv[]) { + using Invoker = GroupedConvolutionBackwardDataInvoker; + auto [result, arg_parser] = create_args(argc, argv); if(!result) return -1; @@ -215,12 +30,16 @@ int run_grouped_conv_bwd_data_example(int argc, char* argv[]) if(data_type == "fp16") { - return run_grouped_conv_bwd_data_example_prec_type( + return run_grouped_conv_bwd_data_example_prec_type( in_layout, wei_layout, out_layout, argc, argv); } else if(data_type == "bf16") { - return run_grouped_conv_bwd_data_example_prec_type( + return run_grouped_conv_bwd_data_example_prec_type( in_layout, wei_layout, out_layout, argc, argv); } else diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp new file mode 100644 index 0000000000..1b3d45427d --- /dev/null +++ 
b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +#include "grouped_convolution_utils.hpp" + +struct GroupedConvolutionBackwardDataInvoker +{ + + template , + typename DsLayout = ck_tile::tuple<>, + typename CDEElementWise = ck_tile::element_wise::PassThrough> + static float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args, + const ck_tile::stream_config& s) + { + constexpr int kBlockPerCu = 1; + + constexpr ck_tile::index_t M_Tile = 64; + constexpr ck_tile::index_t N_Tile = 64; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile; + constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile; + constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile; + + constexpr ck_tile::index_t VectorSizeA = 1; + constexpr ck_tile::index_t VectorSizeB = 1; + constexpr ck_tile::index_t VectorSizeC = 8; + + // Implicit GEMM Traits + using CodegenShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; + using TilePartitioner = ck_tile::GemmTile1DPartitioner; + using GroupedConvTraitsType = ck_tile::GroupedConvTraits; + using CodegenPipelineProblem = ck_tile::GemmPipelineProblem< + InDataType, + WeiDataType, + AccDataType, + CodegenShape, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + InDataType, + true, + GroupedConvTraitsType::VectorSizeA, + GroupedConvTraitsType::VectorSizeB>; + using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + + const auto Run = [&](const auto memory_operation_) { + constexpr auto memory_operation = memory_operation_.value; + + using ConvEpilogue = ck_tile::CShuffleEpilogue>; + + using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel; + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(args); + const dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! 
Skipping conv!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' + << "shape: " << CodegenShape::GetName() << '\n' + << "problem: " << CodegenPipelineProblem::GetName() << '\n' + << "pipeline: " << CodegenPipeline::GetName() << '\n' + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z + << "}" << '\n' + << "Vector size A: " << CodegenPipeline::GetVectorSizeA() + << ", Vector size B: " << CodegenPipeline::GetVectorSizeB() + << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + }; + + if(args.k_batch == 1) + { + return Run(ck_tile::integral_constant{}); + } + else + { + return Run(ck_tile::integral_constant{}); + } + } +}; diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp index cebfa90579..4cddbae3ab 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp @@ -11,190 +11,14 @@ #include "ck_tile/host.hpp" #include "grouped_convolution_utils.hpp" - -template , - typename DsLayout = ck_tile::tuple<>, - typename CDEElementWise = ck_tile::element_wise::PassThrough> -float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_tile::stream_config& s) -{ - constexpr int kBlockPerCu = 1; - - constexpr ck_tile::index_t M_Tile = 64; - constexpr ck_tile::index_t N_Tile = 64; - constexpr ck_tile::index_t K_Tile = 64; - - constexpr ck_tile::index_t M_Warp = 2; - constexpr ck_tile::index_t N_Warp = 2; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile; - constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile; - constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile; - - constexpr ck_tile::index_t VectorSizeA = 8; - constexpr ck_tile::index_t VectorSizeB = 8; - constexpr ck_tile::index_t VectorSizeC = 8; - - // Implicit GEMM Traits - using CodegenShape = - ck_tile::TileGemmShape, - ck_tile::sequence, - ck_tile::sequence>; - - constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; - using TilePartitioner = ck_tile::GemmTile1DPartitioner; - using GroupedConvTraitsType = ck_tile::GroupedConvTraits; - using CodegenPipelineProblem = ck_tile::GemmPipelineProblem< - InDataType, - WeiDataType, - AccDataType, - CodegenShape, - typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd, - ck_tile::element_wise::PassThrough, - ck_tile::element_wise::PassThrough, - InDataType, - true, - GroupedConvTraitsType::VectorSizeA, - GroupedConvTraitsType::VectorSizeB>; - using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; - - const auto Run = [&](const auto memory_operation_) { - constexpr auto memory_operation = memory_operation_.value; - - using ConvEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; - - using Kernel = ck_tile::GroupedConvolutionForwardKernel; - auto kargs = Kernel::MakeKernelArgs(args); - - const dim3 grids = Kernel::GridSize(kargs); - const dim3 blocks = Kernel::BlockSize(); - - if(!Kernel::IsSupportedArgument(kargs)) - { - throw std::runtime_error("Wrong! Arguments not supported! 
Skipping conv!\n"); - } - - if(s.log_level_ > 0) - { - std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' - << "shape: " << CodegenShape::GetName() << '\n' - << "problem: " << CodegenPipelineProblem::GetName() << '\n' - << "pipeline: " << CodegenPipeline::GetName() << '\n' - << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" - << '\n' - << "Vector size A: " << CodegenPipeline::GetVectorSizeA() - << ", Vector size B: " << CodegenPipeline::GetVectorSizeB() - << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; - } - - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - - return ave_time; - }; - - return Run(ck_tile::integral_constant{}); -} - +#include "grouped_convolution_forward_invoker.hpp" #include "run_grouped_convolution_fwd_example.inc" -template -int run_grouped_conv_fwd_example_prec_type( - std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[]) -{ - using NWGC = ck_tile::tensor_layout::convolution::NWGC; - using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; - using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; - - using GKXC = ck_tile::tensor_layout::convolution::GKXC; - using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; - using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; - - using NWGK = ck_tile::tensor_layout::convolution::NWGK; - using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; - using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; - - if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK") - { - return run_grouped_conv_fwd_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NWGC{}, GKXC{}, NWGK{}); - } - else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK") - { - return run_grouped_conv_fwd_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NHWGC{}, GKYXC{}, NHWGK{}); - } - else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "GKZYXC") - { - return run_grouped_conv_fwd_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{}); - } - else - { - throw std::runtime_error("Unsupported memory layout!"); - } -} - template int run_grouped_conv_fwd_example(int argc, char* argv[]) { + using Invoker = GroupedConvolutionForwardInvoker; + auto [result, arg_parser] = create_args(argc, argv); if(!result) return -1; @@ -206,12 +30,12 @@ int run_grouped_conv_fwd_example(int argc, char* argv[]) if(data_type == "fp16") { - return run_grouped_conv_fwd_example_prec_type( + return run_grouped_conv_fwd_example_prec_type( in_layout, wei_layout, out_layout, argc, argv); } else if(data_type == "bf16") { - return run_grouped_conv_fwd_example_prec_type( + return run_grouped_conv_fwd_example_prec_type( in_layout, wei_layout, out_layout, argc, argv); } else diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp new file mode 100644 index 0000000000..0b9879d247 --- /dev/null +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+#pragma once + +#include "grouped_convolution_utils.hpp" + +struct GroupedConvolutionForwardInvoker +{ + template , + typename DsLayout = ck_tile::tuple<>, + typename CDEElementWise = ck_tile::element_wise::PassThrough> + static float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, + const ck_tile::stream_config& s) + { + constexpr int kBlockPerCu = 1; + + constexpr ck_tile::index_t M_Tile = 64; + constexpr ck_tile::index_t N_Tile = 64; + constexpr ck_tile::index_t K_Tile = 64; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile; + constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile; + constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile; + + constexpr ck_tile::index_t VectorSizeA = 8; + constexpr ck_tile::index_t VectorSizeB = 8; + constexpr ck_tile::index_t VectorSizeC = 8; + + // Implicit GEMM Traits + using CodegenShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; + using TilePartitioner = ck_tile::GemmTile1DPartitioner; + using GroupedConvTraitsType = ck_tile::GroupedConvTraits; + using CodegenPipelineProblem = ck_tile::GemmPipelineProblem< + InDataType, + WeiDataType, + AccDataType, + CodegenShape, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + InDataType, + true, + GroupedConvTraitsType::VectorSizeA, + GroupedConvTraitsType::VectorSizeB>; + using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + + const auto Run = [&](const auto memory_operation_) { + constexpr auto memory_operation = memory_operation_.value; + + using ConvEpilogue = ck_tile::CShuffleEpilogue>; + + using Kernel = ck_tile::GroupedConvolutionForwardKernel; + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(kargs); + const dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! 
Skipping conv!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' + << "shape: " << CodegenShape::GetName() << '\n' + << "problem: " << CodegenPipelineProblem::GetName() << '\n' + << "pipeline: " << CodegenPipeline::GetName() << '\n' + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z + << "}" << '\n' + << "Vector size A: " << CodegenPipeline::GetVectorSizeA() + << ", Vector size B: " << CodegenPipeline::GetVectorSizeB() + << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + }; + + return Run(ck_tile::integral_constant{}); + } +}; diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc index 8519daaac2..3d7635bf4f 100644 --- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc +++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc @@ -4,6 +4,7 @@ template ( + float ave_time = Invoker::template grouped_conv_bwd_data( args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); std::size_t flop = args.GetFlops(); @@ -39,6 +40,7 @@ float invoke_grouped_conv_bwd_data(ck_tile::GroupedConvBwdDataHostArgs& args, template +int run_grouped_conv_bwd_data_example_prec_type( + std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[]) +{ + using NWGC = ck_tile::tensor_layout::convolution::NWGC; + using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; + using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; + + using GKXC = ck_tile::tensor_layout::convolution::GKXC; + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; + + using NWGK = ck_tile::tensor_layout::convolution::NWGK; + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; + + if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK") + { + return run_grouped_conv_bwd_data_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NWGC{}, GKXC{}, NWGK{}); + } + else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK") + { + return run_grouped_conv_bwd_data_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NHWGC{}, GKYXC{}, NHWGK{}); + } + else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK") + { + return run_grouped_conv_bwd_data_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{}); + } + else + { + throw std::runtime_error("Unsupported memory layout!"); + } +} diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc index c5ae92a0da..beb6005e19 100644 --- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc +++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc @@ -4,6 +4,7 @@ template ( + float ave_time = Invoker::template 
grouped_conv_fwd( args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); std::size_t flop = args.GetFlops(); @@ -39,6 +40,7 @@ float invoke_grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, template +int run_grouped_conv_fwd_example_prec_type( + std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[]) +{ + using NWGC = ck_tile::tensor_layout::convolution::NWGC; + using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; + using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; + + using GKXC = ck_tile::tensor_layout::convolution::GKXC; + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; + + using NWGK = ck_tile::tensor_layout::convolution::NWGK; + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; + + if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK") + { + return run_grouped_conv_fwd_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NWGC{}, GKXC{}, NWGK{}); + } + else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK") + { + return run_grouped_conv_fwd_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NHWGC{}, GKYXC{}, NHWGK{}); + } + else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK") + { + return run_grouped_conv_fwd_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{}); + } + else + { + throw std::runtime_error("Unsupported memory layout!"); + } +} diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp index ad9e2959f5..5eac387a66 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp @@ -408,8 +408,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy sequence<1, 2>, sequence<2, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kNPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; } @@ -457,8 +456,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy tuple, sequence<2, 0>>, sequence<1, 2>, // N0 K1 sequence<0, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kNPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; } @@ -507,8 +505,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy sequence<1, 2>, sequence<2, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kMPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; } @@ -558,8 +555,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy sequence<1, 2>, sequence<2, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kMPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; }
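The last four hunks in `block_fmha_bwd_pipeline_default_policy.hpp` replace a check on the product of the tile distribution's lengths with a direct power-of-two test on `kKPerBlock`, so the non-power-of-two head dimension 96 added earlier in the diff no longer takes the `return dstr;` branch. The sketch below is an editor's illustration of that `if constexpr` dispatch in isolation; `select_tile_distribution` and the returned strings are hypothetical, and the diff does not show what the alternative branch in CK actually constructs.

```cpp
// Minimal sketch (not CK code) of compile-time dispatch on whether kKPerBlock
// is a power of two. Names and return strings are illustrative only.
#include <cstdio>

template <int kKPerBlock>
constexpr bool is_power_of_two()
{
    return kKPerBlock > 0 && (kKPerBlock & (kKPerBlock - 1)) == 0;
}

template <int kKPerBlock>
constexpr const char* select_tile_distribution()
{
    if constexpr(is_power_of_two<kKPerBlock>())
        return "direct distribution (old fast path)";
    else
        return "alternative distribution (branch not shown in the diff)";
}

int main()
{
    // hdim 64 and 128 keep the direct path; the new hdim 96 falls through.
    std::printf("K=64 : %s\n", select_tile_distribution<64>());
    std::printf("K=96 : %s\n", select_tile_distribution<96>());
    std::printf("K=128: %s\n", select_tile_distribution<128>());
    return 0;
}
```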
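The grouped-convolution changes move the kernel-launch code out of the example `.cpp` files into new `*_invoker.hpp` headers, and the shared `.inc` files now reach the launcher through a template parameter (`Invoker::template grouped_conv_fwd<...>(...)`). Below is a minimal sketch of that pattern with hypothetical names rather than CK's; the `template` disambiguator is required because `Invoker` is a dependent type inside the `.inc`'s own function template.

```cpp
// Illustrative invoker pattern: an example TU picks the Invoker struct, and a
// generic helper (standing in for the shared .inc) calls its static member template.
#include <cstdio>

struct ForwardInvoker
{
    template <typename DataType>
    static float run(int problem_size)
    {
        // Stand-in for the real kernel launch; returns a fake elapsed time.
        std::printf("forward, %d elements, %zu-byte type\n", problem_size, sizeof(DataType));
        return 0.5f;
    }
};

// Generic over which invoker does the work, like run_grouped_conv_*_example_prec_type.
template <typename Invoker, typename DataType>
float invoke_example(int problem_size)
{
    return Invoker::template run<DataType>(problem_size);
}

int main()
{
    float t = invoke_example<ForwardInvoker, float>(1024);
    std::printf("time: %.2f ms\n", t);
    return 0;
}
```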